; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s

; Every function below has the same shape: a vector loop whose lane mask comes
; from llvm.get.active.lane.mask, performing a masked load from %pSrcA, one
; floating-point rounding intrinsic on <4 x float>, and a masked store to
; %pDst. Only the rounding intrinsic differs between functions.

; llvm.round.v4f32: the assertions expect a tail-predicated loop (dlstp/letp)
; whose body rounds with vrinta.f32.
define arm_aapcs_vfpcc void @round(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.rint.v4f32: the assertions expect a tail-predicated loop (dlstp/letp)
; whose body rounds with vrintx.f32.
define arm_aapcs_vfpcc void @rint(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.trunc.v4f32: the assertions expect a tail-predicated loop (dlstp/letp)
; whose body rounds with vrintz.f32.
define arm_aapcs_vfpcc void @trunc(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.ceil.v4f32: the assertions expect a tail-predicated loop (dlstp/letp)
; whose body rounds with vrintp.f32.
define arm_aapcs_vfpcc void @ceil(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.floor.v4f32: the assertions expect a tail-predicated loop (dlstp/letp)
; whose body rounds with vrintm.f32.
define arm_aapcs_vfpcc void @floor(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.nearbyint.v4f32: unlike the functions above, the expected loop body does
; not use a single vector rounding instruction; it contains four scalar
; vrintr.f32 instructions, one per lane (s0-s3). The autogenerated assertions
; below nevertheless expect the loop itself to be tail predicated (dlstp/letp).
; NOTE(review): this comment previously read "nearbyint shouldn't be tail
; predicated because it's lowered into multiple instructions", which
; contradicts the dlstp/letp assertions. Re-run
; utils/update_llc_test_checks.py to confirm which of the two reflects current
; llc output.
define arm_aapcs_vfpcc void @nearbyint(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintr.f32 s3, s3
; CHECK-NEXT:    vrintr.f32 s2, s2
; CHECK-NEXT:    vrintr.f32 s1, s1
; CHECK-NEXT:    vrintr.f32 s0, s0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsics used by the loops above.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>) #4