; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s

; Codegen test for vectorised arithmetic loops on Armv8.1-M with MVE and tail
; predication enabled. Every loop body is guarded by the lane mask produced by
; @llvm.get.active.lane.mask(%index, %N). The expected assembly falls into two
; shapes:
;   * the reduction loops keep an explicit vctp.32 plus a VPT-predicated body
;     inside a dls/le loop, followed by vpsel + vaddv.u32 in the middle block;
;   * the load/compute/store loops become dlstp/letp loops with no explicit
;     predicate instructions in the body.
; The expected-assembly comments are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Sum of a[i] * b[i] over N i32 elements, four lanes per iteration. The middle
; block selects between the new and the previous accumulator using the lane
; mask of the last iteration and reduces the result with vector.reduce.add.
define dso_local i32 @mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmul.i32 q1, q2, q1
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    le lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4 lanes).
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = getelementptr inbounds i32, ptr %b, i32 %index
  %4 = bitcast ptr %3 to ptr
  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %4, i32 4, <4 x i1> %1, <4 x i32> undef)
  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  %6 = add nsw <4 x i32> %5, %vec.phi
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Inactive lanes of the final iteration keep the previous accumulator value.
  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
  ret i32 %res.0.lcssa
}

; Masked add-reduction of a[i] over N i32 elements. Despite the name, the loop
; body contains no multiply and the scalar argument %b is never used; the
; expected assembly accordingly contains only a vadd.i32 in the loop.
define dso_local i32 @mul_reduce_add_const(ptr noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r1, r2, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    bic r1, r1, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
; CHECK-NEXT:    dls lr, r1
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4 lanes).
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Inactive lanes of the final iteration keep the previous accumulator value.
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

; Masked add-reduction of a[i] over N i32 elements. The body is identical to
; @mul_reduce_add_const above, and the scalar argument %b is likewise unused.
define dso_local i32 @add_reduce_add_const(ptr noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: add_reduce_add_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    itt eq
; CHECK-NEXT:    moveq r0, #0
; CHECK-NEXT:    bxeq lr
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    adds r1, r2, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    bic r1, r1, #3
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    add.w r1, r3, r1, lsr #2
; CHECK-NEXT:    dls lr, r1
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %middle.block
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4 lanes).
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %a, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Inactive lanes of the final iteration keep the previous accumulator value.
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

; a[i] = b[i] * c over N i32 elements, with %c splatted across four lanes.
; The expected assembly is a dlstp.32/letp loop that multiplies by the scalar
; register directly (vmul.i32 q0, q0, r2).
define dso_local void @vector_mul_const(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_mul_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vmul.i32 q0, q0, r2
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4 lanes).
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast the scalar %c to all four lanes.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %b, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, ptr %a, i32 %index
  %5 = bitcast ptr %4 to ptr
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] + c over N i32 elements, with %c splatted across four lanes.
; The expected assembly is a dlstp.32/letp loop that adds the scalar register
; directly (vadd.i32 q0, q0, r2).
define dso_local void @vector_add_const(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_add_const:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vadd.i32 q0, q0, r2
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4 lanes).
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  ; Broadcast the scalar %c to all four lanes.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %b, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, ptr %a, i32 %index
  %5 = bitcast ptr %4 to ptr
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] * c[i] over N i8 elements, sixteen lanes per iteration.
; The expected assembly is a dlstp.8/letp loop.
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c, i32 %N) {
; CHECK-LABEL: vector_mul_vector_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmul.i8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (16 lanes).
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %b, i32 %index
  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %1, <16 x i8> undef)
  %3 = getelementptr inbounds i8, ptr %c, i32 %index
  %4 = bitcast ptr %3 to ptr
  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> %1, <16 x i8> undef)
  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i8, ptr %a, i32 %index
  %7 = bitcast ptr %6 to ptr
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %5, ptr %7, i32 1, <16 x i1> %1)
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] * c[i] over N i16 elements, eight lanes per iteration.
; The expected assembly is a dlstp.16/letp loop.
; NOTE(review): this definition references attribute group #0, which is not
; defined in the visible part of this file - confirm that this is intentional.
; Function Attrs: nofree norecurse nounwind
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: vector_mul_vector_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmul.i16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (8 lanes).
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %b, i32 %index
  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %2 = bitcast ptr %0 to ptr
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %1, <8 x i16> undef)
  %3 = getelementptr inbounds i16, ptr %c, i32 %index
  %4 = bitcast ptr %3 to ptr
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %1, <8 x i16> undef)
  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i16, ptr %a, i32 %index
  %7 = bitcast ptr %6 to ptr
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %5, ptr %7, i32 2, <8 x i1> %1)
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics used by the loops above.
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)