; Overview of the first group of tests in this file (@vaddq through @vqsubq).
;
; The RUN line compiles this IR with llc for thumbv8.1m.main with +mve.fp and
; pipes the assembly to FileCheck; the CHECK lines are autogenerated (see the
; NOTE line), so regenerate them with the script rather than editing by hand.
;
; Each function in this group is one loop of the same shape:
;   * an entry guard `icmp sgt i32 <count>, 0` that skips the loop entirely;
;   * a lane mask built from the remaining element count with
;     @llvm.arm.mve.vctp32;
;   * a masked <4 x i32> load, one vector operation against a splat operand
;     (the constant 10 everywhere except @vadd, which splats its %c0
;     argument), and a masked store using the same mask;
;   * pointers advanced by 4 x i32 and the count decreased by 4 per iteration,
;     looping while the previous count was greater than 4.
;
; Expected code, per the CHECK lines: a `dlstp.32 lr, <count>` / `letp` loop
; whose body is exactly a vldrw, the vector instruction, and a vstrw, with the
; splat value supplied as a core-register operand of the vector instruction
; (`movs rN, #10` before the loop, or r1 directly for @vadd).
;
; Two variants are exercised per operation:
;   * (ptr %x, ptr %y, i32 %n): plain IR op or llvm.*.sat intrinsic, loading
;     from %x and storing to %y;
;   * (ptr %s1, ..., i32 %N): @llvm.arm.mve.*.predicated intrinsic taking the
;     vctp32 mask, loading from and storing back to %s1.
;
;   function    IR operation                    checked instruction
;   @vaddq      add <4 x i32>                   vadd.i32
;   @vadd       arm.mve.add.predicated          vadd.i32
;   @vsubq      sub <4 x i32>                   vsub.i32
;   @vsub       arm.mve.sub.predicated          vsub.i32
;   @vmulq      mul <4 x i32>                   vmul.i32
;   @vmul       arm.mve.mul.predicated          vmul.i32
;   @vqaddq     llvm.sadd.sat.v4i32             vqadd.s32
;   @vqaddqu    llvm.uadd.sat.v4i32             vqadd.u32
;   @vqadd      arm.mve.qadd.predicated         vqadd.s32
;   @vqsubq     llvm.ssub.sat.v4i32             vqsub.s32
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s 3 4define void @vaddq(ptr %x, ptr %y, i32 %n) { 5; CHECK-LABEL: vaddq: 6; CHECK: @ %bb.0: @ %entry 7; CHECK-NEXT: .save {r7, lr} 8; CHECK-NEXT: push {r7, lr} 9; CHECK-NEXT: cmp r2, #1 10; CHECK-NEXT: it lt 11; CHECK-NEXT: poplt {r7, pc} 12; CHECK-NEXT: .LBB0_1: @ %for.body.preheader 13; CHECK-NEXT: movs r3, #10 14; CHECK-NEXT: dlstp.32 lr, r2 15; CHECK-NEXT: .LBB0_2: @ %for.body 16; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 17; CHECK-NEXT: vldrw.u32 q0, [r0], #16 18; CHECK-NEXT: vadd.i32 q0, q0, r3 19; CHECK-NEXT: vstrw.32 q0, [r1], #16 20; CHECK-NEXT: letp lr, .LBB0_2 21; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 22; CHECK-NEXT: pop {r7, pc} 23entry: 24 %cmp11 = icmp sgt i32 %n, 0 25 br i1 %cmp11, label %for.body, label %for.cond.cleanup 26 27for.cond.cleanup: ; preds = %for.body, %entry 28 ret void 29 30for.body: ; preds = %entry, %for.body 31 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 32 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 33 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 34 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 35 %1 = bitcast ptr %x.addr.014 to ptr 36 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 37 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 38 %3 = add <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> 39 %4 = bitcast ptr %y.addr.013 to ptr 40 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 41 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 42 %sub = add nsw i32 %i.012, -4 43 %cmp = icmp sgt i32 %i.012, 4 44 br i1 %cmp, label %for.body, label %for.cond.cleanup 45} 46 47define void @vadd(ptr %s1, i32 %c0, i32 %N) { 48; CHECK-LABEL: vadd: 49; CHECK: @ %bb.0: @ %entry 50; 
CHECK-NEXT: .save {r7, lr} 51; CHECK-NEXT: push {r7, lr} 52; CHECK-NEXT: cmp r2, #1 53; CHECK-NEXT: it lt 54; CHECK-NEXT: poplt {r7, pc} 55; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph 56; CHECK-NEXT: dlstp.32 lr, r2 57; CHECK-NEXT: .LBB1_2: @ %while.body 58; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 59; CHECK-NEXT: vldrw.u32 q0, [r0] 60; CHECK-NEXT: vadd.i32 q0, q0, r1 61; CHECK-NEXT: vstrw.32 q0, [r0], #16 62; CHECK-NEXT: letp lr, .LBB1_2 63; CHECK-NEXT: @ %bb.3: @ %while.end 64; CHECK-NEXT: pop {r7, pc} 65entry: 66 %cmp11 = icmp sgt i32 %N, 0 67 br i1 %cmp11, label %while.body.lr.ph, label %while.end 68 69while.body.lr.ph: ; preds = %entry 70 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0 71 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer 72 br label %while.body 73 74while.body: ; preds = %while.body.lr.ph, %while.body 75 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 76 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 77 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 78 %1 = bitcast ptr %s1.addr.013 to ptr 79 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 80 %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) 81 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 82 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 83 %sub = add nsw i32 %N.addr.012, -4 84 %cmp = icmp sgt i32 %N.addr.012, 4 85 br i1 %cmp, label %while.body, label %while.end 86 87while.end: ; preds = %while.body, %entry 88 ret void 89} 90 91define void @vsubq(ptr %x, ptr %y, i32 %n) { 92; CHECK-LABEL: vsubq: 93; CHECK: @ %bb.0: @ %entry 94; CHECK-NEXT: .save {r7, lr} 95; CHECK-NEXT: push {r7, lr} 96; CHECK-NEXT: cmp r2, #1 97; CHECK-NEXT: it lt 98; CHECK-NEXT: poplt {r7, pc} 99; CHECK-NEXT: 
.LBB2_1: @ %for.body.preheader 100; CHECK-NEXT: movs r3, #10 101; CHECK-NEXT: dlstp.32 lr, r2 102; CHECK-NEXT: .LBB2_2: @ %for.body 103; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 104; CHECK-NEXT: vldrw.u32 q0, [r0], #16 105; CHECK-NEXT: vsub.i32 q0, q0, r3 106; CHECK-NEXT: vstrw.32 q0, [r1], #16 107; CHECK-NEXT: letp lr, .LBB2_2 108; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 109; CHECK-NEXT: pop {r7, pc} 110entry: 111 %cmp11 = icmp sgt i32 %n, 0 112 br i1 %cmp11, label %for.body, label %for.cond.cleanup 113 114for.cond.cleanup: ; preds = %for.body, %entry 115 ret void 116 117for.body: ; preds = %entry, %for.body 118 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 119 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 120 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 121 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 122 %1 = bitcast ptr %x.addr.014 to ptr 123 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 124 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 125 %3 = sub <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> 126 %4 = bitcast ptr %y.addr.013 to ptr 127 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 128 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 129 %sub = add nsw i32 %i.012, -4 130 %cmp = icmp sgt i32 %i.012, 4 131 br i1 %cmp, label %for.body, label %for.cond.cleanup 132} 133 134define void @vsub(ptr %s1, i32 %N) { 135; CHECK-LABEL: vsub: 136; CHECK: @ %bb.0: @ %entry 137; CHECK-NEXT: .save {r7, lr} 138; CHECK-NEXT: push {r7, lr} 139; CHECK-NEXT: cmp r1, #1 140; CHECK-NEXT: it lt 141; CHECK-NEXT: poplt {r7, pc} 142; CHECK-NEXT: .LBB3_1: @ %while.body.preheader 143; CHECK-NEXT: movs r2, #10 144; CHECK-NEXT: dlstp.32 lr, r1 145; CHECK-NEXT: .LBB3_2: @ %while.body 146; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 147; CHECK-NEXT: vldrw.u32 q0, [r0] 148; CHECK-NEXT: vsub.i32 q0, q0, 
r2 149; CHECK-NEXT: vstrw.32 q0, [r0], #16 150; CHECK-NEXT: letp lr, .LBB3_2 151; CHECK-NEXT: @ %bb.3: @ %while.end 152; CHECK-NEXT: pop {r7, pc} 153entry: 154 %cmp11 = icmp sgt i32 %N, 0 155 br i1 %cmp11, label %while.body.lr.ph, label %while.end 156 157while.body.lr.ph: ; preds = %entry 158 br label %while.body 159 160while.body: ; preds = %while.body.lr.ph, %while.body 161 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 162 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 163 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 164 %1 = bitcast ptr %s1.addr.013 to ptr 165 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 166 %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) 167 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 168 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 169 %sub = add nsw i32 %N.addr.012, -4 170 %cmp = icmp sgt i32 %N.addr.012, 4 171 br i1 %cmp, label %while.body, label %while.end 172 173while.end: ; preds = %while.body, %entry 174 ret void 175} 176 177define void @vmulq(ptr %x, ptr %y, i32 %n) { 178; CHECK-LABEL: vmulq: 179; CHECK: @ %bb.0: @ %entry 180; CHECK-NEXT: .save {r7, lr} 181; CHECK-NEXT: push {r7, lr} 182; CHECK-NEXT: cmp r2, #1 183; CHECK-NEXT: it lt 184; CHECK-NEXT: poplt {r7, pc} 185; CHECK-NEXT: .LBB4_1: @ %for.body.preheader 186; CHECK-NEXT: movs r3, #10 187; CHECK-NEXT: dlstp.32 lr, r2 188; CHECK-NEXT: .LBB4_2: @ %for.body 189; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 190; CHECK-NEXT: vldrw.u32 q0, [r0], #16 191; CHECK-NEXT: vmul.i32 q0, q0, r3 192; CHECK-NEXT: vstrw.32 q0, [r1], #16 193; CHECK-NEXT: letp lr, .LBB4_2 194; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 195; CHECK-NEXT: pop {r7, pc} 196entry: 197 %cmp11 = icmp sgt i32 %n, 0 198 br i1 %cmp11, 
label %for.body, label %for.cond.cleanup 199 200for.cond.cleanup: ; preds = %for.body, %entry 201 ret void 202 203for.body: ; preds = %entry, %for.body 204 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 205 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 206 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 207 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 208 %1 = bitcast ptr %x.addr.014 to ptr 209 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 210 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 211 %3 = mul <4 x i32> %2, <i32 10, i32 10, i32 10, i32 10> 212 %4 = bitcast ptr %y.addr.013 to ptr 213 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 214 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 215 %sub = add nsw i32 %i.012, -4 216 %cmp = icmp sgt i32 %i.012, 4 217 br i1 %cmp, label %for.body, label %for.cond.cleanup 218} 219 220define void @vmul(ptr %s1, i32 %N) { 221; CHECK-LABEL: vmul: 222; CHECK: @ %bb.0: @ %entry 223; CHECK-NEXT: .save {r7, lr} 224; CHECK-NEXT: push {r7, lr} 225; CHECK-NEXT: cmp r1, #1 226; CHECK-NEXT: it lt 227; CHECK-NEXT: poplt {r7, pc} 228; CHECK-NEXT: .LBB5_1: @ %while.body.preheader 229; CHECK-NEXT: movs r2, #10 230; CHECK-NEXT: dlstp.32 lr, r1 231; CHECK-NEXT: .LBB5_2: @ %while.body 232; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 233; CHECK-NEXT: vldrw.u32 q0, [r0] 234; CHECK-NEXT: vmul.i32 q0, q0, r2 235; CHECK-NEXT: vstrw.32 q0, [r0], #16 236; CHECK-NEXT: letp lr, .LBB5_2 237; CHECK-NEXT: @ %bb.3: @ %while.end 238; CHECK-NEXT: pop {r7, pc} 239entry: 240 %cmp11 = icmp sgt i32 %N, 0 241 br i1 %cmp11, label %while.body.lr.ph, label %while.end 242 243while.body.lr.ph: ; preds = %entry 244 br label %while.body 245 246while.body: ; preds = %while.body.lr.ph, %while.body 247 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 248 %N.addr.012 = 
phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 249 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 250 %1 = bitcast ptr %s1.addr.013 to ptr 251 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 252 %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) 253 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 254 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 255 %sub = add nsw i32 %N.addr.012, -4 256 %cmp = icmp sgt i32 %N.addr.012, 4 257 br i1 %cmp, label %while.body, label %while.end 258 259while.end: ; preds = %while.body, %entry 260 ret void 261} 262 263define void @vqaddq(ptr %x, ptr %y, i32 %n) { 264; CHECK-LABEL: vqaddq: 265; CHECK: @ %bb.0: @ %entry 266; CHECK-NEXT: .save {r7, lr} 267; CHECK-NEXT: push {r7, lr} 268; CHECK-NEXT: cmp r2, #1 269; CHECK-NEXT: it lt 270; CHECK-NEXT: poplt {r7, pc} 271; CHECK-NEXT: .LBB6_1: @ %for.body.preheader 272; CHECK-NEXT: movs r3, #10 273; CHECK-NEXT: dlstp.32 lr, r2 274; CHECK-NEXT: .LBB6_2: @ %for.body 275; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 276; CHECK-NEXT: vldrw.u32 q0, [r0], #16 277; CHECK-NEXT: vqadd.s32 q0, q0, r3 278; CHECK-NEXT: vstrw.32 q0, [r1], #16 279; CHECK-NEXT: letp lr, .LBB6_2 280; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 281; CHECK-NEXT: pop {r7, pc} 282entry: 283 %cmp11 = icmp sgt i32 %n, 0 284 br i1 %cmp11, label %for.body, label %for.cond.cleanup 285 286for.cond.cleanup: ; preds = %for.body, %entry 287 ret void 288 289for.body: ; preds = %entry, %for.body 290 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 291 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 292 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 293 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 294 %1 = bitcast ptr %x.addr.014 to ptr 295 %2 = tail call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 296 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 297 %3 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 298 %4 = bitcast ptr %y.addr.013 to ptr 299 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 300 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 301 %sub = add nsw i32 %i.012, -4 302 %cmp = icmp sgt i32 %i.012, 4 303 br i1 %cmp, label %for.body, label %for.cond.cleanup 304} 305 306define void @vqaddqu(ptr %x, ptr %y, i32 %n) { 307; CHECK-LABEL: vqaddqu: 308; CHECK: @ %bb.0: @ %entry 309; CHECK-NEXT: .save {r7, lr} 310; CHECK-NEXT: push {r7, lr} 311; CHECK-NEXT: cmp r2, #1 312; CHECK-NEXT: it lt 313; CHECK-NEXT: poplt {r7, pc} 314; CHECK-NEXT: .LBB7_1: @ %for.body.preheader 315; CHECK-NEXT: movs r3, #10 316; CHECK-NEXT: dlstp.32 lr, r2 317; CHECK-NEXT: .LBB7_2: @ %for.body 318; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 319; CHECK-NEXT: vldrw.u32 q0, [r0], #16 320; CHECK-NEXT: vqadd.u32 q0, q0, r3 321; CHECK-NEXT: vstrw.32 q0, [r1], #16 322; CHECK-NEXT: letp lr, .LBB7_2 323; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 324; CHECK-NEXT: pop {r7, pc} 325entry: 326 %cmp11 = icmp sgt i32 %n, 0 327 br i1 %cmp11, label %for.body, label %for.cond.cleanup 328 329for.cond.cleanup: ; preds = %for.body, %entry 330 ret void 331 332for.body: ; preds = %entry, %for.body 333 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 334 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 335 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 336 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 337 %1 = bitcast ptr %x.addr.014 to ptr 338 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 339 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 340 %3 = tail call <4 x i32> 
@llvm.uadd.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 341 %4 = bitcast ptr %y.addr.013 to ptr 342 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 343 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 344 %sub = add nsw i32 %i.012, -4 345 %cmp = icmp sgt i32 %i.012, 4 346 br i1 %cmp, label %for.body, label %for.cond.cleanup 347} 348 349define void @vqadd(ptr %s1, i32 %N) { 350; CHECK-LABEL: vqadd: 351; CHECK: @ %bb.0: @ %entry 352; CHECK-NEXT: .save {r7, lr} 353; CHECK-NEXT: push {r7, lr} 354; CHECK-NEXT: cmp r1, #1 355; CHECK-NEXT: it lt 356; CHECK-NEXT: poplt {r7, pc} 357; CHECK-NEXT: .LBB8_1: @ %while.body.preheader 358; CHECK-NEXT: movs r2, #10 359; CHECK-NEXT: dlstp.32 lr, r1 360; CHECK-NEXT: .LBB8_2: @ %while.body 361; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 362; CHECK-NEXT: vldrw.u32 q0, [r0] 363; CHECK-NEXT: vqadd.s32 q0, q0, r2 364; CHECK-NEXT: vstrw.32 q0, [r0], #16 365; CHECK-NEXT: letp lr, .LBB8_2 366; CHECK-NEXT: @ %bb.3: @ %while.end 367; CHECK-NEXT: pop {r7, pc} 368entry: 369 %cmp11 = icmp sgt i32 %N, 0 370 br i1 %cmp11, label %while.body.lr.ph, label %while.end 371 372while.body.lr.ph: ; preds = %entry 373 br label %while.body 374 375while.body: ; preds = %while.body.lr.ph, %while.body 376 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 377 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 378 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 379 %1 = bitcast ptr %s1.addr.013 to ptr 380 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 381 %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) 382 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 383 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 384 
%sub = add nsw i32 %N.addr.012, -4 385 %cmp = icmp sgt i32 %N.addr.012, 4 386 br i1 %cmp, label %while.body, label %while.end 387 388while.end: ; preds = %while.body, %entry 389 ret void 390} 391 392define void @vqsubq(ptr %x, ptr %y, i32 %n) { 393; CHECK-LABEL: vqsubq: 394; CHECK: @ %bb.0: @ %entry 395; CHECK-NEXT: .save {r7, lr} 396; CHECK-NEXT: push {r7, lr} 397; CHECK-NEXT: cmp r2, #1 398; CHECK-NEXT: it lt 399; CHECK-NEXT: poplt {r7, pc} 400; CHECK-NEXT: .LBB9_1: @ %for.body.preheader 401; CHECK-NEXT: movs r3, #10 402; CHECK-NEXT: dlstp.32 lr, r2 403; CHECK-NEXT: .LBB9_2: @ %for.body 404; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 405; CHECK-NEXT: vldrw.u32 q0, [r0], #16 406; CHECK-NEXT: vqsub.s32 q0, q0, r3 407; CHECK-NEXT: vstrw.32 q0, [r1], #16 408; CHECK-NEXT: letp lr, .LBB9_2 409; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 410; CHECK-NEXT: pop {r7, pc} 411entry: 412 %cmp11 = icmp sgt i32 %n, 0 413 br i1 %cmp11, label %for.body, label %for.cond.cleanup 414 415for.cond.cleanup: ; preds = %for.body, %entry 416 ret void 417 418for.body: ; preds = %entry, %for.body 419 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 420 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 421 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 422 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 423 %1 = bitcast ptr %x.addr.014 to ptr 424 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 425 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 426 %3 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 427 %4 = bitcast ptr %y.addr.013 to ptr 428 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 429 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 430 %sub = add nsw i32 %i.012, -4 431 %cmp = icmp sgt i32 %i.012, 4 432 br i1 %cmp, label %for.body, label %for.cond.cleanup 433} 434 
435define void @vqsubqu(ptr %x, ptr %y, i32 %n) { 436; CHECK-LABEL: vqsubqu: 437; CHECK: @ %bb.0: @ %entry 438; CHECK-NEXT: .save {r7, lr} 439; CHECK-NEXT: push {r7, lr} 440; CHECK-NEXT: cmp r2, #1 441; CHECK-NEXT: it lt 442; CHECK-NEXT: poplt {r7, pc} 443; CHECK-NEXT: .LBB10_1: @ %for.body.preheader 444; CHECK-NEXT: movs r3, #10 445; CHECK-NEXT: dlstp.32 lr, r2 446; CHECK-NEXT: .LBB10_2: @ %for.body 447; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 448; CHECK-NEXT: vldrw.u32 q0, [r0], #16 449; CHECK-NEXT: vqsub.u32 q0, q0, r3 450; CHECK-NEXT: vstrw.32 q0, [r1], #16 451; CHECK-NEXT: letp lr, .LBB10_2 452; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 453; CHECK-NEXT: pop {r7, pc} 454entry: 455 %cmp11 = icmp sgt i32 %n, 0 456 br i1 %cmp11, label %for.body, label %for.cond.cleanup 457 458for.cond.cleanup: ; preds = %for.body, %entry 459 ret void 460 461for.body: ; preds = %entry, %for.body 462 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 463 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 464 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 465 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 466 %1 = bitcast ptr %x.addr.014 to ptr 467 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 468 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 469 %3 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 470 %4 = bitcast ptr %y.addr.013 to ptr 471 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 472 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 473 %sub = add nsw i32 %i.012, -4 474 %cmp = icmp sgt i32 %i.012, 4 475 br i1 %cmp, label %for.body, label %for.cond.cleanup 476} 477 478define void @vqsub(ptr %s1, i32 %N) { 479; CHECK-LABEL: vqsub: 480; CHECK: @ %bb.0: @ %entry 481; CHECK-NEXT: .save {r7, lr} 482; CHECK-NEXT: push {r7, lr} 483; CHECK-NEXT: cmp r1, #1 484; 
CHECK-NEXT: it lt 485; CHECK-NEXT: poplt {r7, pc} 486; CHECK-NEXT: .LBB11_1: @ %while.body.preheader 487; CHECK-NEXT: movs r2, #10 488; CHECK-NEXT: dlstp.32 lr, r1 489; CHECK-NEXT: .LBB11_2: @ %while.body 490; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 491; CHECK-NEXT: vldrw.u32 q0, [r0] 492; CHECK-NEXT: vqsub.s32 q0, q0, r2 493; CHECK-NEXT: vstrw.32 q0, [r0], #16 494; CHECK-NEXT: letp lr, .LBB11_2 495; CHECK-NEXT: @ %bb.3: @ %while.end 496; CHECK-NEXT: pop {r7, pc} 497entry: 498 %cmp11 = icmp sgt i32 %N, 0 499 br i1 %cmp11, label %while.body.lr.ph, label %while.end 500 501while.body.lr.ph: ; preds = %entry 502 br label %while.body 503 504while.body: ; preds = %while.body.lr.ph, %while.body 505 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 506 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 507 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 508 %1 = bitcast ptr %s1.addr.013 to ptr 509 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 510 %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) 511 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 512 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 513 %sub = add nsw i32 %N.addr.012, -4 514 %cmp = icmp sgt i32 %N.addr.012, 4 515 br i1 %cmp, label %while.body, label %while.end 516 517while.end: ; preds = %while.body, %entry 518 ret void 519} 520 521define void @vhaddq(ptr %x, ptr %y, i32 %n) { 522; CHECK-LABEL: vhaddq: 523; CHECK: @ %bb.0: @ %entry 524; CHECK-NEXT: .save {r7, lr} 525; CHECK-NEXT: push {r7, lr} 526; CHECK-NEXT: cmp r2, #1 527; CHECK-NEXT: it lt 528; CHECK-NEXT: poplt {r7, pc} 529; CHECK-NEXT: .LBB12_1: @ %for.body.preheader 530; CHECK-NEXT: movs r3, #10 531; CHECK-NEXT: dlstp.32 lr, r2 532; CHECK-NEXT: .LBB12_2: @ %for.body 
533; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 534; CHECK-NEXT: vldrw.u32 q0, [r0], #16 535; CHECK-NEXT: vhadd.s32 q0, q0, r3 536; CHECK-NEXT: vstrw.32 q0, [r1], #16 537; CHECK-NEXT: letp lr, .LBB12_2 538; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 539; CHECK-NEXT: pop {r7, pc} 540entry: 541 %cmp11 = icmp sgt i32 %n, 0 542 br i1 %cmp11, label %for.body, label %for.cond.cleanup 543 544for.cond.cleanup: ; preds = %for.body, %entry 545 ret void 546 547for.body: ; preds = %entry, %for.body 548 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 549 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 550 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 551 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 552 %1 = bitcast ptr %x.addr.014 to ptr 553 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 554 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 555 %3 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) 556 %4 = bitcast ptr %y.addr.013 to ptr 557 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 558 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 559 %sub = add nsw i32 %i.012, -4 560 %cmp = icmp sgt i32 %i.012, 4 561 br i1 %cmp, label %for.body, label %for.cond.cleanup 562} 563 564define void @vhadd(ptr %s1, i32 %N) { 565; CHECK-LABEL: vhadd: 566; CHECK: @ %bb.0: @ %entry 567; CHECK-NEXT: .save {r7, lr} 568; CHECK-NEXT: push {r7, lr} 569; CHECK-NEXT: cmp r1, #1 570; CHECK-NEXT: it lt 571; CHECK-NEXT: poplt {r7, pc} 572; CHECK-NEXT: .LBB13_1: @ %while.body.preheader 573; CHECK-NEXT: movs r2, #10 574; CHECK-NEXT: dlstp.32 lr, r1 575; CHECK-NEXT: .LBB13_2: @ %while.body 576; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 577; CHECK-NEXT: vldrw.u32 q0, [r0] 578; CHECK-NEXT: vhadd.s32 q0, q0, r2 579; CHECK-NEXT: vstrw.32 q0, [r0], #16 580; CHECK-NEXT: letp 
lr, .LBB13_2 581; CHECK-NEXT: @ %bb.3: @ %while.end 582; CHECK-NEXT: pop {r7, pc} 583entry: 584 %cmp11 = icmp sgt i32 %N, 0 585 br i1 %cmp11, label %while.body.lr.ph, label %while.end 586 587while.body.lr.ph: ; preds = %entry 588 br label %while.body 589 590while.body: ; preds = %while.body.lr.ph, %while.body 591 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 592 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 593 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 594 %1 = bitcast ptr %s1.addr.013 to ptr 595 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 596 %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) 597 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 598 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 599 %sub = add nsw i32 %N.addr.012, -4 600 %cmp = icmp sgt i32 %N.addr.012, 4 601 br i1 %cmp, label %while.body, label %while.end 602 603while.end: ; preds = %while.body, %entry 604 ret void 605} 606 607define void @vhsubq(ptr %x, ptr %y, i32 %n) { 608; CHECK-LABEL: vhsubq: 609; CHECK: @ %bb.0: @ %entry 610; CHECK-NEXT: .save {r7, lr} 611; CHECK-NEXT: push {r7, lr} 612; CHECK-NEXT: cmp r2, #1 613; CHECK-NEXT: it lt 614; CHECK-NEXT: poplt {r7, pc} 615; CHECK-NEXT: .LBB14_1: @ %for.body.preheader 616; CHECK-NEXT: movs r3, #10 617; CHECK-NEXT: dlstp.32 lr, r2 618; CHECK-NEXT: .LBB14_2: @ %for.body 619; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 620; CHECK-NEXT: vldrw.u32 q0, [r0], #16 621; CHECK-NEXT: vhsub.s32 q0, q0, r3 622; CHECK-NEXT: vstrw.32 q0, [r1], #16 623; CHECK-NEXT: letp lr, .LBB14_2 624; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 625; CHECK-NEXT: pop {r7, pc} 626entry: 627 %cmp11 = icmp sgt i32 %n, 0 628 br i1 %cmp11, label %for.body, label %for.cond.cleanup 629 
630for.cond.cleanup: ; preds = %for.body, %entry 631 ret void 632 633for.body: ; preds = %entry, %for.body 634 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 635 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 636 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 637 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 638 %1 = bitcast ptr %x.addr.014 to ptr 639 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 640 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 641 %3 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) 642 %4 = bitcast ptr %y.addr.013 to ptr 643 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 644 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 645 %sub = add nsw i32 %i.012, -4 646 %cmp = icmp sgt i32 %i.012, 4 647 br i1 %cmp, label %for.body, label %for.cond.cleanup 648} 649 650define void @vhsub(ptr %s1, i32 %N) { 651; CHECK-LABEL: vhsub: 652; CHECK: @ %bb.0: @ %entry 653; CHECK-NEXT: .save {r7, lr} 654; CHECK-NEXT: push {r7, lr} 655; CHECK-NEXT: cmp r1, #1 656; CHECK-NEXT: it lt 657; CHECK-NEXT: poplt {r7, pc} 658; CHECK-NEXT: .LBB15_1: @ %while.body.preheader 659; CHECK-NEXT: movs r2, #10 660; CHECK-NEXT: dlstp.32 lr, r1 661; CHECK-NEXT: .LBB15_2: @ %while.body 662; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 663; CHECK-NEXT: vldrw.u32 q0, [r0] 664; CHECK-NEXT: vhsub.s32 q0, q0, r2 665; CHECK-NEXT: vstrw.32 q0, [r0], #16 666; CHECK-NEXT: letp lr, .LBB15_2 667; CHECK-NEXT: @ %bb.3: @ %while.end 668; CHECK-NEXT: pop {r7, pc} 669entry: 670 %cmp11 = icmp sgt i32 %N, 0 671 br i1 %cmp11, label %while.body.lr.ph, label %while.end 672 673while.body.lr.ph: ; preds = %entry 674 br label %while.body 675 676while.body: ; preds = %while.body.lr.ph, %while.body 677 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, 
%while.body ] 678 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 679 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 680 %1 = bitcast ptr %s1.addr.013 to ptr 681 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 682 %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0, <4 x i1> %0, <4 x i32> %2) 683 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 684 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 685 %sub = add nsw i32 %N.addr.012, -4 686 %cmp = icmp sgt i32 %N.addr.012, 4 687 br i1 %cmp, label %while.body, label %while.end 688 689while.end: ; preds = %while.body, %entry 690 ret void 691} 692 693define void @vqdmullbq(ptr %x, ptr %y, i32 %n) { 694; CHECK-LABEL: vqdmullbq: 695; CHECK: @ %bb.0: @ %entry 696; CHECK-NEXT: .save {r7, lr} 697; CHECK-NEXT: push {r7, lr} 698; CHECK-NEXT: cmp r2, #1 699; CHECK-NEXT: it lt 700; CHECK-NEXT: poplt {r7, pc} 701; CHECK-NEXT: .LBB16_1: @ %for.body.preheader 702; CHECK-NEXT: movs r3, #10 703; CHECK-NEXT: dlstp.32 lr, r2 704; CHECK-NEXT: .LBB16_2: @ %for.body 705; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 706; CHECK-NEXT: vldrw.u32 q0, [r0], #16 707; CHECK-NEXT: vqdmullb.s32 q1, q0, r3 708; CHECK-NEXT: vstrw.32 q1, [r1], #16 709; CHECK-NEXT: letp lr, .LBB16_2 710; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 711; CHECK-NEXT: pop {r7, pc} 712entry: 713 %cmp11 = icmp sgt i32 %n, 0 714 br i1 %cmp11, label %for.body, label %for.cond.cleanup 715 716for.cond.cleanup: ; preds = %for.body, %entry 717 ret void 718 719for.body: ; preds = %entry, %for.body 720 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 721 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 722 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 723 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 724 %1 = bitcast 
ptr %x.addr.014 to ptr 725 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 726 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 727 %3 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, i32 0) 728 %4 = bitcast <2 x i64> %3 to <4 x i32> 729 %5 = bitcast ptr %y.addr.013 to ptr 730 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %5, i32 4, <4 x i1> %0) 731 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 732 %sub = add nsw i32 %i.012, -4 733 %cmp = icmp sgt i32 %i.012, 4 734 br i1 %cmp, label %for.body, label %for.cond.cleanup 735} 736 737 738define void @vqdmull(ptr %s1, i32 %N) { 739; CHECK-LABEL: vqdmull: 740; CHECK: @ %bb.0: @ %entry 741; CHECK-NEXT: .save {r7, lr} 742; CHECK-NEXT: push {r7, lr} 743; CHECK-NEXT: cmp r1, #1 744; CHECK-NEXT: it lt 745; CHECK-NEXT: poplt {r7, pc} 746; CHECK-NEXT: .LBB17_1: @ %while.body.preheader 747; CHECK-NEXT: movs r2, #10 748; CHECK-NEXT: dlstp.32 lr, r1 749; CHECK-NEXT: .LBB17_2: @ %while.body 750; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 751; CHECK-NEXT: vldrh.s32 q0, [r0] 752; CHECK-NEXT: vqdmullb.s16 q0, q0, r2 753; CHECK-NEXT: vstrw.32 q0, [r0], #16 754; CHECK-NEXT: letp lr, .LBB17_2 755; CHECK-NEXT: @ %bb.3: @ %while.end 756; CHECK-NEXT: pop {r7, pc} 757entry: 758 %cmp11 = icmp sgt i32 %N, 0 759 br i1 %cmp11, label %while.body.lr.ph, label %while.end 760 761while.body.lr.ph: ; preds = %entry 762 br label %while.body 763 764while.body: ; preds = %while.body.lr.ph, %while.body 765 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 766 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 767 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 768 %1 = bitcast ptr %s1.addr.013 to ptr 769 %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer) 770 %3 = sext <4 
x i16> %2 to <4 x i32> 771 %4 = bitcast <4 x i32> %3 to <8 x i16> 772 %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>, i32 0, <4 x i1> %0, <4 x i32> %3) 773 %6 = bitcast ptr %s1.addr.013 to ptr 774 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %6, i32 4, <4 x i1> %0) 775 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 776 %sub = add nsw i32 %N.addr.012, -4 777 %cmp = icmp sgt i32 %N.addr.012, 4 778 br i1 %cmp, label %while.body, label %while.end 779 780while.end: ; preds = %while.body, %entry 781 ret void 782} 783 784define void @vqdmulhq(ptr %x, ptr %y, i32 %n) { 785; CHECK-LABEL: vqdmulhq: 786; CHECK: @ %bb.0: @ %entry 787; CHECK-NEXT: .save {r7, lr} 788; CHECK-NEXT: push {r7, lr} 789; CHECK-NEXT: cmp r2, #1 790; CHECK-NEXT: it lt 791; CHECK-NEXT: poplt {r7, pc} 792; CHECK-NEXT: .LBB18_1: @ %for.body.preheader 793; CHECK-NEXT: movs r3, #10 794; CHECK-NEXT: dlstp.32 lr, r2 795; CHECK-NEXT: .LBB18_2: @ %for.body 796; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 797; CHECK-NEXT: vldrw.u32 q0, [r0], #16 798; CHECK-NEXT: vqdmulh.s32 q0, q0, r3 799; CHECK-NEXT: vstrw.32 q0, [r1], #16 800; CHECK-NEXT: letp lr, .LBB18_2 801; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 802; CHECK-NEXT: pop {r7, pc} 803entry: 804 %cmp11 = icmp sgt i32 %n, 0 805 br i1 %cmp11, label %for.body, label %for.cond.cleanup 806 807for.cond.cleanup: ; preds = %for.body, %entry 808 ret void 809 810for.body: ; preds = %entry, %for.body 811 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 812 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 813 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 814 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 815 %1 = bitcast ptr %x.addr.014 to ptr 816 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 817 %add.ptr = getelementptr 
inbounds i32, ptr %x.addr.014, i32 4 818 %3 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 819 %4 = bitcast ptr %y.addr.013 to ptr 820 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 821 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 822 %sub = add nsw i32 %i.012, -4 823 %cmp = icmp sgt i32 %i.012, 4 824 br i1 %cmp, label %for.body, label %for.cond.cleanup 825} 826 827define void @vqdmulh(ptr %s1, i32 %N) { 828; CHECK-LABEL: vqdmulh: 829; CHECK: @ %bb.0: @ %entry 830; CHECK-NEXT: .save {r7, lr} 831; CHECK-NEXT: push {r7, lr} 832; CHECK-NEXT: cmp r1, #1 833; CHECK-NEXT: it lt 834; CHECK-NEXT: poplt {r7, pc} 835; CHECK-NEXT: .LBB19_1: @ %while.body.preheader 836; CHECK-NEXT: movs r2, #10 837; CHECK-NEXT: dlstp.32 lr, r1 838; CHECK-NEXT: .LBB19_2: @ %while.body 839; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 840; CHECK-NEXT: vldrw.u32 q0, [r0] 841; CHECK-NEXT: vqdmulh.s32 q0, q0, r2 842; CHECK-NEXT: vstrw.32 q0, [r0], #16 843; CHECK-NEXT: letp lr, .LBB19_2 844; CHECK-NEXT: @ %bb.3: @ %while.end 845; CHECK-NEXT: pop {r7, pc} 846entry: 847 %cmp11 = icmp sgt i32 %N, 0 848 br i1 %cmp11, label %while.body.lr.ph, label %while.end 849 850while.body.lr.ph: ; preds = %entry 851 br label %while.body 852 853while.body: ; preds = %while.body.lr.ph, %while.body 854 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 855 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 856 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 857 %1 = bitcast ptr %s1.addr.013 to ptr 858 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 859 %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) 860 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x 
i1> %0) 861 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 862 %sub = add nsw i32 %N.addr.012, -4 863 %cmp = icmp sgt i32 %N.addr.012, 4 864 br i1 %cmp, label %while.body, label %while.end 865 866while.end: ; preds = %while.body, %entry 867 ret void 868} 869 870define void @vqrdmulhq(ptr %x, ptr %y, i32 %n) { 871; CHECK-LABEL: vqrdmulhq: 872; CHECK: @ %bb.0: @ %entry 873; CHECK-NEXT: .save {r7, lr} 874; CHECK-NEXT: push {r7, lr} 875; CHECK-NEXT: cmp r2, #1 876; CHECK-NEXT: it lt 877; CHECK-NEXT: poplt {r7, pc} 878; CHECK-NEXT: .LBB20_1: @ %for.body.preheader 879; CHECK-NEXT: movs r3, #10 880; CHECK-NEXT: dlstp.32 lr, r2 881; CHECK-NEXT: .LBB20_2: @ %for.body 882; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 883; CHECK-NEXT: vldrw.u32 q0, [r0], #16 884; CHECK-NEXT: vqrdmulh.s32 q0, q0, r3 885; CHECK-NEXT: vstrw.32 q0, [r1], #16 886; CHECK-NEXT: letp lr, .LBB20_2 887; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 888; CHECK-NEXT: pop {r7, pc} 889entry: 890 %cmp11 = icmp sgt i32 %n, 0 891 br i1 %cmp11, label %for.body, label %for.cond.cleanup 892 893for.cond.cleanup: ; preds = %for.body, %entry 894 ret void 895 896for.body: ; preds = %entry, %for.body 897 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 898 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 899 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 900 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 901 %1 = bitcast ptr %x.addr.014 to ptr 902 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 903 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4 904 %3 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>) 905 %4 = bitcast ptr %y.addr.013 to ptr 906 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %4, i32 4, <4 x i1> %0) 907 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4 908 %sub = add nsw i32 %i.012, -4 909 
; NOTE: the physical line below is a collapsed run of the original test file.
; It carries, in order: the last two instructions and closing brace of
; @vqrdmulhq, the complete definition of @vqrdmulh, and the first CHECK lines
; of @vmlaq.
;
; @vqrdmulh(ptr %s1, i32 %N) -- in-place loop over %s1, four i32 lanes per trip:
;   * entry skips the loop unless %N > 0 (signed).
;   * each trip derives a <4 x i1> mask from the remaining count with
;     llvm.arm.mve.vctp32, does a masked load from the current pointer, calls
;     llvm.arm.mve.qrdmulh.predicated with a splat of i32 10 (the loaded value
;     %2 is also passed as the trailing operand), and masked-stores the result
;     back through the same pointer.
;   * the pointer advances by 4 elements and the count by -4; the loop repeats
;     while the pre-decrement count is > 4 (signed).
; The CHECK lines expect a dlstp.32 / letp loop whose body is
; vldrw.u32 / vqrdmulh.s32 q0, q0, r2 / vstrw.32, with the constant 10
; materialised once in r2 ahead of the loop.
; Per the file header these assertions are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
%cmp = icmp sgt i32 %i.012, 4 910 br i1 %cmp, label %for.body, label %for.cond.cleanup 911} 912 913define void @vqrdmulh(ptr %s1, i32 %N) { 914; CHECK-LABEL: vqrdmulh: 915; CHECK: @ %bb.0: @ %entry 916; CHECK-NEXT: .save {r7, lr} 917; CHECK-NEXT: push {r7, lr} 918; CHECK-NEXT: cmp r1, #1 919; CHECK-NEXT: it lt 920; CHECK-NEXT: poplt {r7, pc} 921; CHECK-NEXT: .LBB21_1: @ %while.body.preheader 922; CHECK-NEXT: movs r2, #10 923; CHECK-NEXT: dlstp.32 lr, r1 924; CHECK-NEXT: .LBB21_2: @ %while.body 925; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 926; CHECK-NEXT: vldrw.u32 q0, [r0] 927; CHECK-NEXT: vqrdmulh.s32 q0, q0, r2 928; CHECK-NEXT: vstrw.32 q0, [r0], #16 929; CHECK-NEXT: letp lr, .LBB21_2 930; CHECK-NEXT: @ %bb.3: @ %while.end 931; CHECK-NEXT: pop {r7, pc} 932entry: 933 %cmp11 = icmp sgt i32 %N, 0 934 br i1 %cmp11, label %while.body.lr.ph, label %while.end 935 936while.body.lr.ph: ; preds = %entry 937 br label %while.body 938 939while.body: ; preds = %while.body.lr.ph, %while.body 940 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 941 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 942 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 943 %1 = bitcast ptr %s1.addr.013 to ptr 944 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 945 %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i1> %0, <4 x i32> %2) 946 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %1, i32 4, <4 x i1> %0) 947 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4 948 %sub = add nsw i32 %N.addr.012, -4 949 %cmp = icmp sgt i32 %N.addr.012, 4 950 br i1 %cmp, label %while.body, label %while.end 951 952while.end: ; preds = %while.body, %entry 953 ret void 954} 955 956define void @vmlaq(ptr %x, ptr %y, i32 %n) { 957; CHECK-LABEL: vmlaq: 958; CHECK: @ %bb.0: @ %entry 959; 
CHECK-NEXT: .save {r7, lr} 960; CHECK-NEXT: push {r7, lr} 961; CHECK-NEXT: cmp r2, #1 962; CHECK-NEXT: it lt 963; CHECK-NEXT: poplt {r7, pc} 964; CHECK-NEXT: .LBB22_1: @ %for.body.preheader 965; CHECK-NEXT: movs r3, #10 966; CHECK-NEXT: dlstp.32 lr, r2 967; CHECK-NEXT: .LBB22_2: @ %for.body 968; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 969; CHECK-NEXT: vldrw.u32 q0, [r1] 970; CHECK-NEXT: vldrw.u32 q1, [r0], #16 971; CHECK-NEXT: vmla.i32 q1, q0, r3 972; CHECK-NEXT: vstrw.32 q1, [r1], #16 973; CHECK-NEXT: letp lr, .LBB22_2 974; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 975; CHECK-NEXT: pop {r7, pc} 976entry: 977 %cmp14 = icmp sgt i32 %n, 0 978 br i1 %cmp14, label %for.body, label %for.cond.cleanup 979 980for.cond.cleanup: ; preds = %for.body, %entry 981 ret void 982 983for.body: ; preds = %entry, %for.body 984 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 985 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 986 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 987 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) 988 %1 = bitcast ptr %x.addr.017 to ptr 989 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 990 %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4 991 %3 = bitcast ptr %y.addr.016 to ptr 992 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 993 %5 = mul <4 x i32> %4, <i32 10, i32 10, i32 10, i32 10> 994 %6 = add <4 x i32> %5, %2 995 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %3, i32 4, <4 x i1> %0) 996 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4 997 %sub = add nsw i32 %i.015, -4 998 %cmp = icmp sgt i32 %i.015, 4 999 br i1 %cmp, label %for.body, label %for.cond.cleanup 1000} 1001 1002define void @vmlaqp(ptr %x, ptr %y, i32 %n) { 1003; CHECK-LABEL: vmlaqp: 1004; CHECK: @ %bb.0: @ %entry 1005; CHECK-NEXT: .save {r7, lr} 1006; CHECK-NEXT: push {r7, lr} 
1007; CHECK-NEXT: cmp r2, #1 1008; CHECK-NEXT: it lt 1009; CHECK-NEXT: poplt {r7, pc} 1010; CHECK-NEXT: .LBB23_1: @ %for.body.preheader 1011; CHECK-NEXT: movs r3, #10 1012; CHECK-NEXT: dlstp.32 lr, r2 1013; CHECK-NEXT: .LBB23_2: @ %for.body 1014; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1015; CHECK-NEXT: vldrw.u32 q0, [r1] 1016; CHECK-NEXT: vldrw.u32 q1, [r0], #16 1017; CHECK-NEXT: vmla.i32 q1, q0, r3 1018; CHECK-NEXT: vstrw.32 q1, [r1], #16 1019; CHECK-NEXT: letp lr, .LBB23_2 1020; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1021; CHECK-NEXT: pop {r7, pc} 1022entry: 1023 %cmp15 = icmp sgt i32 %n, 0 1024 br i1 %cmp15, label %for.body, label %for.cond.cleanup 1025 1026for.cond.cleanup: ; preds = %for.body, %entry 1027 ret void 1028 1029for.body: ; preds = %entry, %for.body 1030 %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1031 %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1032 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1033 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) 1034 %1 = bitcast ptr %x.addr.018 to ptr 1035 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1036 %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4 1037 %3 = bitcast ptr %y.addr.017 to ptr 1038 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1039 %5 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0) 1040 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %3, i32 4, <4 x i1> %0) 1041 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4 1042 %sub = add nsw i32 %i.016, -4 1043 %cmp = icmp sgt i32 %i.016, 4 1044 br i1 %cmp, label %for.body, label %for.cond.cleanup 1045} 1046 1047define void @vmlasq(ptr %x, ptr %y, i32 %n) { 1048; CHECK-LABEL: vmlasq: 1049; CHECK: @ %bb.0: @ %entry 1050; CHECK-NEXT: .save {r7, lr} 1051; CHECK-NEXT: push 
{r7, lr} 1052; CHECK-NEXT: cmp r2, #1 1053; CHECK-NEXT: it lt 1054; CHECK-NEXT: poplt {r7, pc} 1055; CHECK-NEXT: .LBB24_1: @ %for.body.preheader 1056; CHECK-NEXT: movs r3, #10 1057; CHECK-NEXT: dlstp.32 lr, r2 1058; CHECK-NEXT: .LBB24_2: @ %for.body 1059; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1060; CHECK-NEXT: vldrw.u32 q0, [r0], #16 1061; CHECK-NEXT: vldrw.u32 q1, [r1] 1062; CHECK-NEXT: vmlas.i32 q1, q0, r3 1063; CHECK-NEXT: vstrw.32 q1, [r1], #16 1064; CHECK-NEXT: letp lr, .LBB24_2 1065; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1066; CHECK-NEXT: pop {r7, pc} 1067entry: 1068 %cmp14 = icmp sgt i32 %n, 0 1069 br i1 %cmp14, label %for.body, label %for.cond.cleanup 1070 1071for.cond.cleanup: ; preds = %for.body, %entry 1072 ret void 1073 1074for.body: ; preds = %entry, %for.body 1075 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1076 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1077 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1078 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) 1079 %1 = bitcast ptr %x.addr.017 to ptr 1080 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1081 %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4 1082 %3 = bitcast ptr %y.addr.016 to ptr 1083 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1084 %5 = mul <4 x i32> %4, %2 1085 %6 = add <4 x i32> %5, <i32 10, i32 10, i32 10, i32 10> 1086 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %3, i32 4, <4 x i1> %0) 1087 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4 1088 %sub = add nsw i32 %i.015, -4 1089 %cmp = icmp sgt i32 %i.015, 4 1090 br i1 %cmp, label %for.body, label %for.cond.cleanup 1091} 1092 1093define void @vmlasqp(ptr %x, ptr %y, i32 %n) { 1094; CHECK-LABEL: vmlasqp: 1095; CHECK: @ %bb.0: @ %entry 1096; CHECK-NEXT: .save {r7, lr} 1097; CHECK-NEXT: push {r7, lr} 1098; 
CHECK-NEXT: cmp r2, #1 1099; CHECK-NEXT: it lt 1100; CHECK-NEXT: poplt {r7, pc} 1101; CHECK-NEXT: .LBB25_1: @ %for.body.preheader 1102; CHECK-NEXT: movs r3, #10 1103; CHECK-NEXT: dlstp.32 lr, r2 1104; CHECK-NEXT: .LBB25_2: @ %for.body 1105; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1106; CHECK-NEXT: vldrw.u32 q0, [r1] 1107; CHECK-NEXT: vldrw.u32 q1, [r0], #16 1108; CHECK-NEXT: vmlas.i32 q1, q0, r3 1109; CHECK-NEXT: vstrw.32 q1, [r1], #16 1110; CHECK-NEXT: letp lr, .LBB25_2 1111; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1112; CHECK-NEXT: pop {r7, pc} 1113entry: 1114 %cmp15 = icmp sgt i32 %n, 0 1115 br i1 %cmp15, label %for.body, label %for.cond.cleanup 1116 1117for.cond.cleanup: ; preds = %for.body, %entry 1118 ret void 1119 1120for.body: ; preds = %entry, %for.body 1121 %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1122 %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1123 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1124 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016) 1125 %1 = bitcast ptr %x.addr.018 to ptr 1126 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1127 %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4 1128 %3 = bitcast ptr %y.addr.017 to ptr 1129 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer) 1130 %5 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 10, <4 x i1> %0) 1131 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %3, i32 4, <4 x i1> %0) 1132 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4 1133 %sub = add nsw i32 %i.016, -4 1134 %cmp = icmp sgt i32 %i.016, 4 1135 br i1 %cmp, label %for.body, label %for.cond.cleanup 1136} 1137 1138define void @vaddqf(ptr %x, ptr %y, i32 %n) { 1139; CHECK-LABEL: vaddqf: 1140; CHECK: @ %bb.0: @ %entry 1141; CHECK-NEXT: .save {r7, lr} 1142; CHECK-NEXT: push {r7, 
lr} 1143; CHECK-NEXT: cmp r2, #1 1144; CHECK-NEXT: it lt 1145; CHECK-NEXT: poplt {r7, pc} 1146; CHECK-NEXT: .LBB26_1: @ %for.body.preheader 1147; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1148; CHECK-NEXT: dlstp.32 lr, r2 1149; CHECK-NEXT: .LBB26_2: @ %for.body 1150; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1151; CHECK-NEXT: vldrw.u32 q1, [r0], #16 1152; CHECK-NEXT: vadd.f32 q1, q1, q0 1153; CHECK-NEXT: vstrw.32 q1, [r1], #16 1154; CHECK-NEXT: letp lr, .LBB26_2 1155; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1156; CHECK-NEXT: pop {r7, pc} 1157entry: 1158 %cmp11 = icmp sgt i32 %n, 0 1159 br i1 %cmp11, label %for.body, label %for.cond.cleanup 1160 1161for.cond.cleanup: ; preds = %for.body, %entry 1162 ret void 1163 1164for.body: ; preds = %entry, %for.body 1165 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1166 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1167 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1168 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 1169 %1 = bitcast ptr %x.addr.014 to ptr 1170 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1171 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4 1172 %3 = fadd fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> 1173 %4 = bitcast ptr %y.addr.013 to ptr 1174 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0) 1175 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4 1176 %sub = add nsw i32 %i.012, -4 1177 %cmp = icmp sgt i32 %i.012, 4 1178 br i1 %cmp, label %for.body, label %for.cond.cleanup 1179} 1180 1181define void @vaddf(ptr %s1, i32 %N) { 1182; CHECK-LABEL: vaddf: 1183; CHECK: @ %bb.0: @ %entry 1184; CHECK-NEXT: .save {r7, lr} 1185; CHECK-NEXT: push {r7, lr} 1186; CHECK-NEXT: cmp r1, #1 1187; CHECK-NEXT: it lt 1188; CHECK-NEXT: poplt {r7, pc} 1189; CHECK-NEXT: .LBB27_1: @ %while.body.preheader 1190; 
CHECK-NEXT: movs r2, #0 1191; CHECK-NEXT: movt r2, #16672 1192; CHECK-NEXT: dlstp.32 lr, r1 1193; CHECK-NEXT: .LBB27_2: @ %while.body 1194; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1195; CHECK-NEXT: vldrw.u32 q0, [r0] 1196; CHECK-NEXT: vadd.f32 q0, q0, r2 1197; CHECK-NEXT: vstrw.32 q0, [r0], #16 1198; CHECK-NEXT: letp lr, .LBB27_2 1199; CHECK-NEXT: @ %bb.3: @ %while.end 1200; CHECK-NEXT: pop {r7, pc} 1201entry: 1202 %cmp11 = icmp sgt i32 %N, 0 1203 br i1 %cmp11, label %while.body.lr.ph, label %while.end 1204 1205while.body.lr.ph: ; preds = %entry 1206 br label %while.body 1207 1208while.body: ; preds = %while.body.lr.ph, %while.body 1209 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 1210 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 1211 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 1212 %1 = bitcast ptr %s1.addr.013 to ptr 1213 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1214 %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) 1215 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0) 1216 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 1217 %sub = add nsw i32 %N.addr.012, -4 1218 %cmp = icmp sgt i32 %N.addr.012, 4 1219 br i1 %cmp, label %while.body, label %while.end 1220 1221while.end: ; preds = %while.body, %entry 1222 ret void 1223} 1224 1225define void @vsubqf(ptr %x, ptr %y, i32 %n) { 1226; CHECK-LABEL: vsubqf: 1227; CHECK: @ %bb.0: @ %entry 1228; CHECK-NEXT: .save {r7, lr} 1229; CHECK-NEXT: push {r7, lr} 1230; CHECK-NEXT: cmp r2, #1 1231; CHECK-NEXT: it lt 1232; CHECK-NEXT: poplt {r7, pc} 1233; CHECK-NEXT: .LBB28_1: @ %for.body.preheader 1234; CHECK-NEXT: vmov.f32 q0, #-1.000000e+01 1235; CHECK-NEXT: dlstp.32 lr, r2 1236; CHECK-NEXT: 
.LBB28_2: @ %for.body 1237; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1238; CHECK-NEXT: vldrw.u32 q1, [r0], #16 1239; CHECK-NEXT: vadd.f32 q1, q1, q0 1240; CHECK-NEXT: vstrw.32 q1, [r1], #16 1241; CHECK-NEXT: letp lr, .LBB28_2 1242; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1243; CHECK-NEXT: pop {r7, pc} 1244entry: 1245 %cmp11 = icmp sgt i32 %n, 0 1246 br i1 %cmp11, label %for.body, label %for.cond.cleanup 1247 1248for.cond.cleanup: ; preds = %for.body, %entry 1249 ret void 1250 1251for.body: ; preds = %entry, %for.body 1252 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1253 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1254 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1255 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 1256 %1 = bitcast ptr %x.addr.014 to ptr 1257 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1258 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4 1259 %3 = fsub fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> 1260 %4 = bitcast ptr %y.addr.013 to ptr 1261 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0) 1262 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4 1263 %sub = add nsw i32 %i.012, -4 1264 %cmp = icmp sgt i32 %i.012, 4 1265 br i1 %cmp, label %for.body, label %for.cond.cleanup 1266} 1267 1268define void @vsubf(ptr %s1, i32 %N) { 1269; CHECK-LABEL: vsubf: 1270; CHECK: @ %bb.0: @ %entry 1271; CHECK-NEXT: .save {r7, lr} 1272; CHECK-NEXT: push {r7, lr} 1273; CHECK-NEXT: cmp r1, #1 1274; CHECK-NEXT: it lt 1275; CHECK-NEXT: poplt {r7, pc} 1276; CHECK-NEXT: .LBB29_1: @ %while.body.preheader 1277; CHECK-NEXT: movs r2, #0 1278; CHECK-NEXT: movt r2, #16672 1279; CHECK-NEXT: dlstp.32 lr, r1 1280; CHECK-NEXT: .LBB29_2: @ %while.body 1281; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1282; CHECK-NEXT: vldrw.u32 q0, [r0] 1283; CHECK-NEXT: 
vsub.f32 q0, q0, r2 1284; CHECK-NEXT: vstrw.32 q0, [r0], #16 1285; CHECK-NEXT: letp lr, .LBB29_2 1286; CHECK-NEXT: @ %bb.3: @ %while.end 1287; CHECK-NEXT: pop {r7, pc} 1288entry: 1289 %cmp11 = icmp sgt i32 %N, 0 1290 br i1 %cmp11, label %while.body.lr.ph, label %while.end 1291 1292while.body.lr.ph: ; preds = %entry 1293 br label %while.body 1294 1295while.body: ; preds = %while.body.lr.ph, %while.body 1296 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 1297 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 1298 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 1299 %1 = bitcast ptr %s1.addr.013 to ptr 1300 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1301 %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) 1302 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0) 1303 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 1304 %sub = add nsw i32 %N.addr.012, -4 1305 %cmp = icmp sgt i32 %N.addr.012, 4 1306 br i1 %cmp, label %while.body, label %while.end 1307 1308while.end: ; preds = %while.body, %entry 1309 ret void 1310} 1311 1312define void @vmulqf(ptr %x, ptr %y, i32 %n) { 1313; CHECK-LABEL: vmulqf: 1314; CHECK: @ %bb.0: @ %entry 1315; CHECK-NEXT: .save {r7, lr} 1316; CHECK-NEXT: push {r7, lr} 1317; CHECK-NEXT: cmp r2, #1 1318; CHECK-NEXT: it lt 1319; CHECK-NEXT: poplt {r7, pc} 1320; CHECK-NEXT: .LBB30_1: @ %for.body.preheader 1321; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1322; CHECK-NEXT: dlstp.32 lr, r2 1323; CHECK-NEXT: .LBB30_2: @ %for.body 1324; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1325; CHECK-NEXT: vldrw.u32 q1, [r0], #16 1326; CHECK-NEXT: vmul.f32 q1, q1, q0 1327; CHECK-NEXT: vstrw.32 q1, [r1], #16 1328; CHECK-NEXT: letp lr, .LBB30_2 1329; 
CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1330; CHECK-NEXT: pop {r7, pc} 1331entry: 1332 %cmp11 = icmp sgt i32 %n, 0 1333 br i1 %cmp11, label %for.body, label %for.cond.cleanup 1334 1335for.cond.cleanup: ; preds = %for.body, %entry 1336 ret void 1337 1338for.body: ; preds = %entry, %for.body 1339 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1340 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1341 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1342 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012) 1343 %1 = bitcast ptr %x.addr.014 to ptr 1344 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1345 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4 1346 %3 = fmul fast <4 x float> %2, <float 10.0, float 10.0, float 10.0, float 10.0> 1347 %4 = bitcast ptr %y.addr.013 to ptr 1348 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %0) 1349 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4 1350 %sub = add nsw i32 %i.012, -4 1351 %cmp = icmp sgt i32 %i.012, 4 1352 br i1 %cmp, label %for.body, label %for.cond.cleanup 1353} 1354 1355define void @vmulf(ptr %s1, i32 %N) { 1356; CHECK-LABEL: vmulf: 1357; CHECK: @ %bb.0: @ %entry 1358; CHECK-NEXT: .save {r7, lr} 1359; CHECK-NEXT: push {r7, lr} 1360; CHECK-NEXT: cmp r1, #1 1361; CHECK-NEXT: it lt 1362; CHECK-NEXT: poplt {r7, pc} 1363; CHECK-NEXT: .LBB31_1: @ %while.body.preheader 1364; CHECK-NEXT: movs r2, #0 1365; CHECK-NEXT: movt r2, #16672 1366; CHECK-NEXT: dlstp.32 lr, r1 1367; CHECK-NEXT: .LBB31_2: @ %while.body 1368; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1369; CHECK-NEXT: vldrw.u32 q0, [r0] 1370; CHECK-NEXT: vmul.f32 q0, q0, r2 1371; CHECK-NEXT: vstrw.32 q0, [r0], #16 1372; CHECK-NEXT: letp lr, .LBB31_2 1373; CHECK-NEXT: @ %bb.3: @ %while.end 1374; CHECK-NEXT: pop {r7, pc} 1375entry: 1376 %cmp11 = icmp sgt i32 %N, 0 1377 br i1 %cmp11, label 
%while.body.lr.ph, label %while.end 1378 1379while.body.lr.ph: ; preds = %entry 1380 br label %while.body 1381 1382while.body: ; preds = %while.body.lr.ph, %while.body 1383 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 1384 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 1385 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) 1386 %1 = bitcast ptr %s1.addr.013 to ptr 1387 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1388 %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %0, <4 x float> %2) 1389 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %1, i32 4, <4 x i1> %0) 1390 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4 1391 %sub = add nsw i32 %N.addr.012, -4 1392 %cmp = icmp sgt i32 %N.addr.012, 4 1393 br i1 %cmp, label %while.body, label %while.end 1394 1395while.end: ; preds = %while.body, %entry 1396 ret void 1397} 1398 1399define void @vfmaq(ptr %x, ptr %y, i32 %n) { 1400; CHECK-LABEL: vfmaq: 1401; CHECK: @ %bb.0: @ %entry 1402; CHECK-NEXT: .save {r7, lr} 1403; CHECK-NEXT: push {r7, lr} 1404; CHECK-NEXT: cmp r2, #1 1405; CHECK-NEXT: it lt 1406; CHECK-NEXT: poplt {r7, pc} 1407; CHECK-NEXT: .LBB32_1: @ %for.body.preheader 1408; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1409; CHECK-NEXT: dlstp.32 lr, r2 1410; CHECK-NEXT: .LBB32_2: @ %for.body 1411; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1412; CHECK-NEXT: vldrw.u32 q1, [r1] 1413; CHECK-NEXT: vldrw.u32 q2, [r0], #16 1414; CHECK-NEXT: vfma.f32 q2, q1, q0 1415; CHECK-NEXT: vstrw.32 q2, [r1], #16 1416; CHECK-NEXT: letp lr, .LBB32_2 1417; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1418; CHECK-NEXT: pop {r7, pc} 1419entry: 1420 %cmp14 = icmp sgt i32 %n, 0 1421 br i1 %cmp14, label %for.body, label %for.cond.cleanup 1422 1423for.cond.cleanup: ; 
preds = %for.body, %entry 1424 ret void 1425 1426for.body: ; preds = %entry, %for.body 1427 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1428 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1429 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1430 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) 1431 %1 = bitcast ptr %x.addr.017 to ptr 1432 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1433 %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4 1434 %3 = bitcast ptr %y.addr.016 to ptr 1435 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1436 %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %2) 1437 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %3, i32 4, <4 x i1> %0) 1438 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4 1439 %sub = add nsw i32 %i.015, -4 1440 %cmp = icmp sgt i32 %i.015, 4 1441 br i1 %cmp, label %for.body, label %for.cond.cleanup 1442} 1443 1444define void @vfma(ptr %s1, ptr %s2, i32 %N) { 1445; CHECK-LABEL: vfma: 1446; CHECK: @ %bb.0: @ %entry 1447; CHECK-NEXT: .save {r7, lr} 1448; CHECK-NEXT: push {r7, lr} 1449; CHECK-NEXT: cmp r2, #1 1450; CHECK-NEXT: it lt 1451; CHECK-NEXT: poplt {r7, pc} 1452; CHECK-NEXT: .LBB33_1: @ %while.body.lr.ph 1453; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1454; CHECK-NEXT: dlstp.32 lr, r2 1455; CHECK-NEXT: .LBB33_2: @ %while.body 1456; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1457; CHECK-NEXT: vldrw.u32 q1, [r1] 1458; CHECK-NEXT: vldrw.u32 q2, [r0] 1459; CHECK-NEXT: vfma.f32 q2, q1, q0 1460; CHECK-NEXT: vstrw.32 q2, [r0], #16 1461; CHECK-NEXT: letp lr, .LBB33_2 1462; CHECK-NEXT: @ %bb.3: @ %while.end 1463; CHECK-NEXT: pop {r7, pc} 1464entry: 1465 %cmp12 = icmp sgt i32 %N, 0 1466 br i1 %cmp12, label 
%while.body.lr.ph, label %while.end 1467 1468while.body.lr.ph: ; preds = %entry 1469 %0 = bitcast ptr %s2 to ptr 1470 br label %while.body 1471 1472while.body: ; preds = %while.body.lr.ph, %while.body 1473 %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ] 1474 %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] 1475 %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) 1476 %2 = bitcast ptr %s1.addr.014 to ptr 1477 %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) 1478 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer) 1479 %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x float> %3, <4 x i1> %1) 1480 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %2, i32 4, <4 x i1> %1) 1481 %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4 1482 %sub = add nsw i32 %N.addr.013, -4 1483 %cmp = icmp sgt i32 %N.addr.013, 4 1484 br i1 %cmp, label %while.body, label %while.end 1485 1486while.end: ; preds = %while.body, %entry 1487 ret void 1488} 1489 1490define void @vfmasq(ptr %x, ptr %y, i32 %n) { 1491; CHECK-LABEL: vfmasq: 1492; CHECK: @ %bb.0: @ %entry 1493; CHECK-NEXT: .save {r7, lr} 1494; CHECK-NEXT: push {r7, lr} 1495; CHECK-NEXT: cmp r2, #1 1496; CHECK-NEXT: it lt 1497; CHECK-NEXT: poplt {r7, pc} 1498; CHECK-NEXT: .LBB34_1: @ %for.body.preheader 1499; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1500; CHECK-NEXT: dlstp.32 lr, r2 1501; CHECK-NEXT: .LBB34_2: @ %for.body 1502; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1503; CHECK-NEXT: vmov q3, q0 1504; CHECK-NEXT: vldrw.u32 q1, [r1] 1505; CHECK-NEXT: vldrw.u32 q2, [r0], #16 1506; CHECK-NEXT: vfma.f32 q3, q2, q1 1507; CHECK-NEXT: vstrw.32 q3, [r1], #16 1508; CHECK-NEXT: letp lr, .LBB34_2 1509; CHECK-NEXT: @ %bb.3: @ 
%for.cond.cleanup 1510; CHECK-NEXT: pop {r7, pc} 1511entry: 1512 %cmp14 = icmp sgt i32 %n, 0 1513 br i1 %cmp14, label %for.body, label %for.cond.cleanup 1514 1515for.cond.cleanup: ; preds = %for.body, %entry 1516 ret void 1517 1518for.body: ; preds = %entry, %for.body 1519 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ] 1520 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ] 1521 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ] 1522 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015) 1523 %1 = bitcast ptr %x.addr.017 to ptr 1524 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1525 %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4 1526 %3 = bitcast ptr %y.addr.016 to ptr 1527 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) 1528 %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %2, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>) 1529 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %3, i32 4, <4 x i1> %0) 1530 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4 1531 %sub = add nsw i32 %i.015, -4 1532 %cmp = icmp sgt i32 %i.015, 4 1533 br i1 %cmp, label %for.body, label %for.cond.cleanup 1534} 1535 1536define void @vfmas(ptr %s1, ptr %s2, i32 %N) { 1537; CHECK-LABEL: vfmas: 1538; CHECK: @ %bb.0: @ %entry 1539; CHECK-NEXT: .save {r7, lr} 1540; CHECK-NEXT: push {r7, lr} 1541; CHECK-NEXT: cmp r2, #1 1542; CHECK-NEXT: it lt 1543; CHECK-NEXT: poplt {r7, pc} 1544; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph 1545; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 1546; CHECK-NEXT: dlstp.32 lr, r2 1547; CHECK-NEXT: .LBB35_2: @ %while.body 1548; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1549; CHECK-NEXT: vmov q3, q0 1550; CHECK-NEXT: vldrw.u32 q1, [r1] 1551; CHECK-NEXT: vldrw.u32 q2, [r0] 1552; CHECK-NEXT: vfma.f32 q3, q2, q1 1553; 
CHECK-NEXT: vstrw.32 q3, [r0], #16
; CHECK-NEXT: letp lr, .LBB35_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp12 = icmp sgt i32 %N, 0
  br i1 %cmp12, label %while.body.lr.ph, label %while.end

while.body.lr.ph: ; preds = %entry
  %0 = bitcast ptr %s2 to ptr
  br label %while.body

while.body: ; preds = %while.body.lr.ph, %while.body
  %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  %2 = bitcast ptr %s1.addr.014 to ptr
  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> <float 10.0, float 10.0, float 10.0, float 10.0>, <4 x i1> %1)
  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %5, ptr %2, i32 4, <4 x i1> %1)
  %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4
  %sub = add nsw i32 %N.addr.013, -4
  %cmp = icmp sgt i32 %N.addr.013, 4
  br i1 %cmp, label %while.body, label %while.end

while.end: ; preds = %while.body, %entry
  ret void
}

; rgbconvert: two nested loops over a 2-D buffer.
;   * Outer loop (%for.body): runs sext(%iHeight) times and is skipped entirely
;     when %iHeight <= 0. After each pass the source pointer advances by
;     sext(%iSourceStride) i32 elements and the target pointer by
;     sext(%iTargetStride) i16 elements.
;   * Inner loop (%do.body): processes 4 lanes per iteration under a
;     @llvm.arm.mve.vctp32 mask built from a counter that starts at
;     sext(%iWidth) and drops by 4 each time round.
; Each inner iteration loads <4 x i32>, applies three predicated qdmulh
; operations, masks the results to bits 0-4, 5-10 and 11-15, ORs them together,
; truncates to <4 x i16> and stores under the same mask.
; NOTE(review): the name and the 5/6/5-bit masks suggest an RGB888 -> RGB565
; conversion; nothing in this file states that explicitly -- confirm.
; The expected assembly that follows has the inner loop as a dlstp.32/letp
; tail-predicated loop, with the three qdmulh calls emitted as plain
; vqdmulh.s32 taking a scalar register operand and the mask vectors set up once
; in the outer preheader.
define void @rgbconvert(ptr noalias %pwSourceBase, i16 signext %iSourceStride, ptr noalias %phwTargetBase, i16 signext %iTargetStride, i16 %iHeight, i16 %iWidth) {
; CHECK-LABEL: rgbconvert:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: ldrsh.w r3, [sp, #80]
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: blt .LBB36_5
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT: mov r9, r2
; CHECK-NEXT: ldr r2, [sp, #84]
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: mov.w r11, #8388608
; CHECK-NEXT: mov.w r4, #67108864
; CHECK-NEXT: sxth.w r12, r2
; CHECK-NEXT: vmov.i32 q0, #0xf800
; CHECK-NEXT: vmov.i32 q1, #0x1f
; CHECK-NEXT: mov.w r2, #2016
; CHECK-NEXT: mov.w r7, #268435456
; CHECK-NEXT: vdup.32 q2, r2
; CHECK-NEXT: .LBB36_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB36_3 Depth 2
; CHECK-NEXT: mov r2, r9
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB36_3: @ %do.body
; CHECK-NEXT: @ Parent Loop BB36_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q3, [r5], #16
; CHECK-NEXT: vqdmulh.s32 q4, q3, r4
; CHECK-NEXT: vqdmulh.s32 q5, q3, r7
; CHECK-NEXT: vqdmulh.s32 q3, q3, r11
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vand q5, q5, q1
; CHECK-NEXT: vand q3, q3, q0
; CHECK-NEXT: vorr q4, q4, q5
; CHECK-NEXT: vorr q3, q4, q3
; CHECK-NEXT: vstrh.32 q3, [r2], #8
; CHECK-NEXT: letp lr, .LBB36_3
; CHECK-NEXT: @ %bb.4: @ %do.end
; CHECK-NEXT: @ in Loop: Header=BB36_2 Depth=1
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r10, #1
; CHECK-NEXT: add.w r0, r0, r1, lsl #2
; CHECK-NEXT: cmp r10, r3
; CHECK-NEXT: add.w r9, r9, r2, lsl #1
; CHECK-NEXT: bne .LBB36_2
; CHECK-NEXT: .LBB36_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  ; Widen the i16 row count and both strides to i32; the guard below compares
  ; the original i16 height, so a non-positive height skips both loops.
  %conv = sext i16 %iHeight to i32
  %conv9 = sext i16 %iSourceStride to i32
  %conv11 = sext i16 %iTargetStride to i32
  %cmp37 = icmp sgt i16 %iHeight, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
  ; The sign-extended width is the initial element count for every inner loop.
  %conv2 = sext i16 %iWidth to i32
  br label %for.body

for.cond.cleanup: ; preds = %do.end, %entry
  ret void

for.body: ; preds = %for.body.lr.ph, %do.end
  ; Outer-loop state: current source row, current target row, row index.
  %pwSourceBase.addr.040 = phi ptr [ %pwSourceBase, %for.body.lr.ph ], [ %add.ptr10, %do.end ]
  %phwTargetBase.addr.039 = phi ptr [ %phwTargetBase, %for.body.lr.ph ], [ %add.ptr12, %do.end ]
  %y.038 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %do.end ]
  br label %do.body

do.body: ; preds = %do.body, %for.body
  ; Inner-loop state: write cursor, read cursor, elements still to process.
  %pTarget.0 = phi ptr [ %phwTargetBase.addr.039, %for.body ], [ %add.ptr6, %do.body ]
  %pSource.0 = phi ptr [ %pwSourceBase.addr.040, %for.body ], [ %add.ptr, %do.body ]
  %blkCnt.0 = phi i32 [ %conv2, %for.body ], [ %sub, %do.body ]
  ; Lane mask derived from the remaining count; it guards the load, all three
  ; qdmulh calls and the store.
  %l2 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
  ; ptr -> ptr bitcast: source and destination types are identical.
  %l3 = bitcast ptr %pSource.0 to ptr
  %l4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %l3, i32 4, <4 x i1> %l2, <4 x i32> zeroinitializer)
  ; Three predicated qdmulh calls on the same input with splat multipliers
  ; 268435456 (2^28), 67108864 (2^26) and 8388608 (2^23); the inactive-lane
  ; operand is undef in each. Results are masked with 31 (0x1f), 2016 (0x7e0)
  ; and 63488 (0xf800) respectively, i.e. three non-overlapping bit fields.
  ; NOTE(review): if qdmulh is the usual doubling multiply-high, these
  ; multipliers act as right shifts by 3, 5 and 8 -- confirm against the MVE
  ; intrinsic documentation.
  %l5 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 268435456, i32 268435456, i32 268435456, i32 268435456>, <4 x i1> %l2, <4 x i32> undef)
  %and = and <4 x i32> %l5, <i32 31, i32 31, i32 31, i32 31>
  %l6 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 67108864, i32 67108864, i32 67108864, i32 67108864>, <4 x i1> %l2, <4 x i32> undef)
  %and3 = and <4 x i32> %l6, <i32 2016, i32 2016, i32 2016, i32 2016>
  %l7 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %l4, <4 x i32> <i32 8388608, i32 8388608, i32 8388608, i32 8388608>, <4 x i1> %l2, <4 x i32> undef)
  %and4 = and <4 x i32> %l7, <i32 63488, i32 63488, i32 63488, i32 63488>
  ; Merge the three fields, narrow each lane to 16 bits and store under the mask.
  %or = or <4 x i32> %and3, %and
  %or5 = or <4 x i32> %or, %and4
  %l8 = trunc <4 x i32> %or5 to <4 x i16>
  ; ptr -> ptr bitcast: source and destination types are identical.
  %l9 = bitcast ptr %pTarget.0 to ptr
  tail call void @llvm.masked.store.v4i16.p0(<4 x i16> %l8, ptr %l9, i32 2, <4 x i1> %l2)
  ; Advance both cursors by 4 elements and the count by -4; loop again only
  ; while more than 4 elements were outstanding at the start of this iteration.
  %add.ptr = getelementptr inbounds i32, ptr %pSource.0, i32 4
  %add.ptr6 = getelementptr inbounds i16, ptr %pTarget.0, i32 4
  %sub = add nsw i32 %blkCnt.0, -4
  %cmp7 = icmp sgt i32 %blkCnt.0, 4
  br i1 %cmp7, label %do.body, label %do.end

do.end: ; preds = %do.body
  ; Step both row pointers by their strides (in elements of i32 / i16) and
  ; leave once the row index reaches the sign-extended height.
  %add.ptr10 = getelementptr inbounds i32, ptr %pwSourceBase.addr.040, i32 %conv9
  %add.ptr12 = getelementptr inbounds i16, ptr %phwTargetBase.addr.039, i32 %conv11
  %inc = add nuw nsw i32 %y.038, 1
  %exitcond.not = icmp eq i32 %inc, %conv
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; Intrinsic declarations. Only part of the file is visible here, so some of
; these are presumably used by functions earlier in the file.
; Lane-mask intrinsic and the masked load/store intrinsics.
; NOTE(review): the v4i16 store references attribute group #3, and vqdmull
; further down references #1; no `attributes #N = { ... }` definition is visible
; in this part of the file -- confirm they are defined elsewhere or drop the
; references.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32 immarg, <4 x i1>) #3

; Integer vector intrinsics: the llvm.arm.mve.* forms (several in both a
; `.predicated` and a plain variant) plus the generic llvm.*.sat ones.
declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32)
declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
; Floating-point vector intrinsics.
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)