; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s

; Float multiply-accumulate loops with a scalar operand, compiled with
; tail predication enabled.
;
; Each function in this group has the same skeleton:
;   * entry: leave immediately unless %n > 0;
;   * vector.ph: round %n up to a multiple of 4 and splat %a (or -%a);
;   * vector.body: load <4 x float> from %x and %y under the predicate produced
;     by @llvm.get.active.lane.mask(%index, %n), combine them with the splat,
;     and store the result to %z under the same predicate.
; The functions differ only in how the multiply/add/negate is written in IR.
; The autogenerated assertions pin which FMA form llc selects and that the
; loop is emitted as a dlstp.32 / letp loop.

; z = fma(x, y, splat(a)): the splat is the addend of @llvm.fma.
; Expected: vfmas.f32 taking the scalar from r12.
define arm_aapcs_vfpcc void @fmas1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = (y * x) + splat(a), written as separate fast fmul and fadd.
; Expected: the same vfmas.f32 form as the @llvm.fma variant.
define arm_aapcs_vfpcc void @fmas2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fadd fast <4 x float> %3, %broadcast.splat14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = fma(x, splat(a), y): the splat is a multiplicand of @llvm.fma.
; Expected: vfma.f32 taking the scalar from r12.
define arm_aapcs_vfpcc void @fma1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = (x * splat(a)) + y, written as separate fast fmul and fadd.
; Expected: the same vfma.f32 form as the @llvm.fma variant.
define arm_aapcs_vfpcc void @fma2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %3 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
  %4 = fadd fast <4 x float> %2, %wide.masked.load14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = fma(x, y, splat(-a)): the scalar is negated in vector.ph before the splat.
; Expected: the negation is a sign-bit eor on r12 outside the loop, followed by
; vfmas.f32 taking the scalar from r12.
define arm_aapcs_vfpcc void @fmss1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = (y * x) - splat(a), written as fast fmul followed by fsub.
; Expected: no scalar-operand form here. The splat is materialised with vdup.32
; and negated with vneg.f32 outside the loop; inside the loop it is copied with
; vmov and used as the accumulator of a vector-operand vfma.f32.
define arm_aapcs_vfpcc void @fmss2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    vdup.32 q0, r12
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfma.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fsub fast <4 x float> %3, %broadcast.splat14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = fma(x, -y, splat(a)): one multiplicand of @llvm.fma is negated.
; Each function in this group: exits early unless %n > 0, loads <4 x float>
; from %x and %y under the @llvm.get.active.lane.mask(%index, %n) predicate,
; and stores to %z under the same predicate.
; Expected: the splat is materialised with vdup.32 outside the loop, copied with
; vmov inside it, and used as the accumulator of a vector-operand vfms.f32.
define arm_aapcs_vfpcc void @fmss3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fneg fast <4 x float> %wide.masked.load12
  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %3, <4 x float> %broadcast.splat14)
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = splat(a) - (y * x), written as fast fmul followed by fsub.
; Expected: the same vdup.32 / vmov / vector-operand vfms.f32 sequence as the
; negated-multiplicand @llvm.fma form.
define arm_aapcs_vfpcc void @fmss4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %4 = fsub fast <4 x float> %broadcast.splat14, %3
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = fma(x, splat(-a), y): the scalar is negated in vector.ph before the splat.
; Expected: the negation is a sign-bit eor on r12 outside the loop, followed by
; vfma.f32 taking the scalar from r12.
define arm_aapcs_vfpcc void @fms1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %4 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = y - (x * splat(a)), written as fast fmul followed by fsub.
; Expected: the splat is materialised with vdup.32 outside the loop and fed to a
; vector-operand vfms.f32 that accumulates into the second loaded vector.
define arm_aapcs_vfpcc void @fms2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q2, q1, q0
; CHECK-NEXT:    vstrw.32 q2, [r2], #16
; CHECK-NEXT:    letp lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
  %4 = fsub fast <4 x float> %wide.masked.load12, %3
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = fma(x, splat(a), -y): the addend of @llvm.fma is negated.
; Expected: a vneg.f32 inside the loop, followed by vfma.f32 taking the scalar
; from r12.
define arm_aapcs_vfpcc void @fms3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB10_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB10_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrw.32 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fneg fast <4 x float> %wide.masked.load12
  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %3)
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; z = (x * splat(a)) - y, written as fast fmul followed by fsub.
; Expected: the same in-loop vneg.f32 + scalar-operand vfma.f32 sequence as the
; negated-addend @llvm.fma form.
define arm_aapcs_vfpcc void @fms4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB11_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB11_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrw.32 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %3 = getelementptr inbounds float, ptr %y, i32 %index
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
  %4 = fsub fast <4 x float> %2, %wide.masked.load14
  %5 = getelementptr inbounds float, ptr %z, i32 %index
  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics used by the loops in this file.
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)