; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Each loop in these tests splats a scalar half across <8 x half> and combines
; it with vector loads, 8 elements per iteration. The assertions pin which MVE
; instruction form llc selects for the splat operand.

; test_fadd: C[i] = A[i] + splat(B).
; Expected: B is moved to a GPR once ahead of the loop (vmov.f16 r3, s0) and
; consumed by the vector-by-scalar form vadd.f16 q0, q0, r3.
define arm_aapcs_vfpcc void @test_fadd(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vadd.f16 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  ; Tell the optimizer that n is a multiple of 8 (n & 7 == 0).
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely when n <= 0.
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  ; Splat idiom: insert B into lane 0, then shuffle with an all-zero mask.
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fadd fast <8 x half> %wide.load, %broadcast.splat11
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fadd_r: same as test_fadd with the fadd operands swapped
; (splat(B) + A[i]). Expected: the same vadd.f16 q0, q0, r3 form.
define arm_aapcs_vfpcc void @test_fadd_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vadd.f16 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  ; Splat is the first operand here (the "_r" variant).
  %i3 = fadd fast <8 x half> %broadcast.splat11, %wide.load
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fmul: C[i] = A[i] * splat(B).
; Expected: vmul.f16 q0, q0, r3 with the scalar kept in a GPR.
define arm_aapcs_vfpcc void @test_fmul(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vmul.f16 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat11
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fmul_r: test_fmul with the fmul operands swapped (splat(B) * A[i]).
; Expected: the same vmul.f16 q0, q0, r3 form.
define arm_aapcs_vfpcc void @test_fmul_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vmul.f16 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
; test_fmul_r, remainder of the entry block: assume n is a multiple of 8 and
; skip the loop when n <= 0.
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  ; Splat idiom: insert B into lane 0, then shuffle with an all-zero mask.
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  ; Splat is the first fmul operand (the "_r" variant).
  %i3 = fmul fast <8 x half> %broadcast.splat11, %wide.load
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fsub: C[i] = A[i] - splat(B).
; Expected: the splat stays in a GPR (vmov.f16 r3, s0 ahead of the loop) and
; feeds the vector-by-scalar form vsub.f16 q0, q0, r3.
define arm_aapcs_vfpcc void @test_fsub(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vsub.f16 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
; test_fsub, remainder of vector.ph: finish the splat of B (all-zero shuffle
; mask), then run the loop computing A[i] - splat(B).
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fsub fast <8 x half> %wide.load, %broadcast.splat11
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fsub_r: C[i] = splat(B) - A[i] (splat is the minuend).
; Expected: no vector-by-scalar form is used here. The scalar is broadcast
; once ahead of the loop (vdup.16 q0, r3) and the loop uses the
; vector-by-vector vsub.f16 q1, q0, q1.
define arm_aapcs_vfpcc void @test_fsub_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r3, s0
; CHECK-NEXT: vdup.16 q0, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vsub.f16 q1, q0, q1
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  ; Tell the optimizer that n is a multiple of 8, and skip the loop if n <= 0.
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fsub fast <8 x half> %broadcast.splat11, %wide.load
  %i4 = getelementptr inbounds half, ptr %C, i32 %index
  store <8 x half> %i3, ptr %i4, align 4
  %index.next = add i32 %index, 8
  %i6 = icmp eq i32 %index.next, %n
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}


; test_fmas: D[i] = B[i] * A[i] + splat(C) (the splat is the addend).
; Expected: vfmas.f16 q1, q0, r12 with the scalar in a GPR.
define arm_aapcs_vfpcc void @test_fmas(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfmas.f16 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load12 = load <8 x half>, ptr %i3, align 4
  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
  %i6 = fadd fast <8 x half> %i5, %broadcast.splat14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fmas_r: test_fmas with the fadd operands swapped
; (splat(C) + B[i] * A[i]). Expected: the same vfmas.f16 q1, q0, r12 form.
define arm_aapcs_vfpcc void @test_fmas_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB7_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfmas.f16 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = getelementptr inbounds half, ptr %B, i32 %index
; test_fmas_r, remainder of vector.body: splat(C) + B[i] * A[i], with the
; splat as the first fadd operand.
  %wide.load12 = load <8 x half>, ptr %i3, align 4
  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
  %i6 = fadd fast <8 x half> %broadcast.splat14, %i5
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fma: D[i] = A[i] * splat(C) + B[i] (the splat is a multiplicand).
; Expected: vfma.f16 q1, q0, r12, accumulating into the vector loaded from B.
define arm_aapcs_vfpcc void @test_fma(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB8_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfma.f16 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  ; Tell the optimizer that n is a multiple of 8, and skip the loop if n <= 0.
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  ; Splat idiom: insert C into lane 0, then shuffle with an all-zero mask.
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
  %i4 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load14 = load <8 x half>, ptr %i4, align 4
  %i6 = fadd fast <8 x half> %i3, %wide.load14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fma_r: test_fma with the fmul operands swapped
; (splat(C) * A[i] + B[i]). Expected: the same vfma.f16 q1, q0, r12 form.
define arm_aapcs_vfpcc void @test_fma_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB9_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfma.f16 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
  %i4 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load14 = load <8 x half>, ptr %i4, align 4
  %i6 = fadd fast <8 x half> %i3, %wide.load14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}


; test_fmss: D[i] = B[i] * A[i] - splat(C) (the splat is subtracted).
; Expected: the splat is broadcast and negated once ahead of the loop
; (vdup.16 q0, r12 / vneg.f16 q0, q0). Each iteration copies it (vmov q3, q0)
; and accumulates with the vector-by-vector vfma.f16 q3, q2, q1.
define arm_aapcs_vfpcc void @test_fmss(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB10_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: vdup.16 q0, r12
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfma.f16 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r2], #16
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load12 = load <8 x half>, ptr %i3, align 4
  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
  %i6 = fsub fast <8 x half> %i5, %broadcast.splat14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fmss_r: D[i] = splat(C) - B[i] * A[i] (the product is subtracted).
; Expected: the splat is broadcast once (vdup.16 q0, r12). Each iteration
; copies it (vmov q3, q0) and uses the vector-by-vector vfms.f16 q3, q2, q1.
define arm_aapcs_vfpcc void @test_fmss_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB11_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: vdup.16 q0, r12
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vfms.f16 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r2], #16
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
; test_fmss_r, remainder of vector.body: splat(C) - B[i] * A[i].
  %i3 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load12 = load <8 x half>, ptr %i3, align 4
  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
  ; The splat is the minuend; the product is what gets subtracted.
  %i6 = fsub fast <8 x half> %broadcast.splat14, %i5
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fms: D[i] = A[i] * splat(C) - B[i].
; Expected: the vector loaded from B is negated in the loop
; (vneg.f16 q0, q0), then accumulated with the vector-by-scalar
; vfma.f16 q0, q1, r12.
define arm_aapcs_vfpcc void @test_fms(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB12_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB12_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB12_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  ; Tell the optimizer that n is a multiple of 8, and skip the loop if n <= 0.
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  ; Splat idiom: insert C into lane 0, then shuffle with an all-zero mask.
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
; test_fms, remainder of vector.body: A[i] * splat(C) - B[i].
  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
  %i4 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load14 = load <8 x half>, ptr %i4, align 4
  %i6 = fsub fast <8 x half> %i3, %wide.load14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  ; Advance by one vector (8 halves); exit when the index reaches n.
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; test_fms_r: test_fms with the fmul operands swapped
; (splat(C) * A[i] - B[i]). Expected: the same vneg.f16 + vfma.f16 q0, q1, r12
; sequence.
define arm_aapcs_vfpcc void @test_fms_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB13_1: @ %vector.ph
; CHECK-NEXT: vmov.f16 r12, s0
; CHECK-NEXT: .LBB13_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB13_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  ; Tell the optimizer that n is a multiple of 8, and skip the loop if n <= 0.
  %i = and i32 %n, 7
  %cmp = icmp eq i32 %i, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  ; Splat idiom: insert C into lane 0, then shuffle with an all-zero mask.
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %i1 = getelementptr inbounds half, ptr %A, i32 %index
  %wide.load = load <8 x half>, ptr %i1, align 4
  %i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
  %i4 = getelementptr inbounds half, ptr %B, i32 %index
  %wide.load14 = load <8 x half>, ptr %i4, align 4
  %i6 = fsub fast <8 x half> %i3, %wide.load14
  %i7 = getelementptr inbounds half, ptr %D, i32 %index
  store <8 x half> %i6, ptr %i7, align 4
  %index.next = add i32 %index, 8
  %i9 = icmp eq i32 %index.next, %n
  br i1 %i9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}


; test_nested: two-level loop. The outer loop loads one half from pOutT1 per
; row and splats it; the inner loop updates pInT1 in place, 8 halves at a
; time, as pInT1[j] - pPRT_in[j] * splat.
; Expected: the splat is materialised once per outer iteration
; (vdup.16 q0, r4), the inner loop uses the vector-by-vector
; vfms.f16 q2, q1, q0, and the outer loop closes with "le lr, .LBB14_1".
define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias nocapture readonly %pOutT1, ptr noalias nocapture readonly %pPRT_in, ptr noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr {
; CHECK-LABEL: test_nested:
; CHECK: @ %bb.0: @ %for.body.us.preheader
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: ldrd lr, r12, [sp, #16]
; CHECK-NEXT: lsl.w r3, r12, #1
; CHECK-NEXT: .LBB14_1: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
; CHECK-NEXT: ldrh r4, [r1]
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov r6, r12
; CHECK-NEXT: vdup.16 q0, r4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: .LBB14_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r5], #16
; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vfms.f16 q2, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r4], #16
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT: add r0, r3
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: le lr, .LBB14_1
; CHECK-NEXT: @ %bb.4: @ %for.end14
; CHECK-NEXT: pop {r4, r5, r6, pc}
for.body.us.preheader:
; test_nested preheader: assumptions handed to the optimizer before the loops:
; numRows > 0, numCols > 0, numCols is a multiple of 8, and l < numCols.
  %cmp = icmp sgt i32 %numRows, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp1 = icmp sgt i32 %numCols, 0
  tail call void @llvm.assume(i1 %cmp1)
  %rem = and i32 %numCols, 7
  %cmp2 = icmp eq i32 %rem, 0
  tail call void @llvm.assume(i1 %cmp2)
  %cmp3 = icmp slt i32 %l, %numCols
  tail call void @llvm.assume(i1 %cmp3)
  br label %for.body.us

; Outer loop: one iteration per row.
for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
  %pInT1.addr.038.us = phi ptr [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
  %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
  %pOutT1.addr.036.us = phi ptr [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
  %pPRT_in.addr.035.us = phi ptr [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
  ; Next row of pPRT_in starts numCols halves further on.
  %scevgep = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %numCols
  ; Per-row scalar, splatted across all 8 lanes.
  %i = load half, ptr %pOutT1.addr.036.us, align 4
  %broadcast.splatinsert47 = insertelement <8 x half> undef, half %i, i32 0
  %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

; Inner loop: pInT1[j..j+7] -= pPRT_in[j..j+7] * splat, stored back in place.
vector.body: ; preds = %vector.body, %for.body.us
  %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, ptr %pInT1.addr.038.us, i32 %index
  %next.gep45 = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %index
  %wide.load = load <8 x half>, ptr %next.gep, align 4
  %wide.load46 = load <8 x half>, ptr %next.gep45, align 4
  %i3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
  %i4 = fsub fast <8 x half> %wide.load, %i3
  store <8 x half> %i4, ptr %next.gep, align 4
  ; Advance by one vector (8 halves); exit when the index reaches numCols.
  %index.next = add i32 %index, 8
  %i5 = icmp eq i32 %index.next, %numCols
  br i1 %i5, label %for.cond6.for.end_crit_edge.us, label %vector.body

; test_nested outer-loop latch: step pOutT1 by one half and pInT1 by numCols
; halves, then repeat until numRows rows have been processed.
for.cond6.for.end_crit_edge.us: ; preds = %vector.body
  %incdec.ptr.us = getelementptr inbounds half, ptr %pOutT1.addr.036.us, i32 1
  %scevgep40 = getelementptr half, ptr %pInT1.addr.038.us, i32 %numCols
  %inc13.us = add nuw nsw i32 %i.037.us, 1
  %exitcond41 = icmp eq i32 %inc13.us, %numRows
  br i1 %exitcond41, label %for.end14, label %for.body.us

for.end14: ; preds = %for.cond6.for.end_crit_edge.us
  ret void
}

; Instance struct read by arm_fir_f32_1_4_mve: field 0 is loaded as the i16
; tap count, field 1 as the state pointer, field 2 as the coefficient pointer.
%struct.arm_fir_instance_f32 = type { i16, ptr, ptr }

; arm_fir_f32_1_4_mve: 4-coefficient multiply-accumulate kernel. The struct
; and function names say f32, but every data access below is half or
; <8 x half>.
; When numTaps - 1 < 4 (unsigned), the while.body loop runs blockSize >> 2
; times. Each iteration copies 8 halves from pSrc into the state buffer, then
; sums four overlapping <8 x half> loads (at offsets 0..3 halves), each scaled
; by one splatted coefficient. All pointers advance by 4 halves per iteration.
; Expected in the loop: vmul.f16 followed by three vfma.f16, each taking its
; coefficient from a GPR (r7, r6, r12, r8), with wls/le low-overhead loops.
define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr %pDst, i32 %blockSize) {
; CHECK-LABEL: arm_fir_f32_1_4_mve:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: ldrh.w r9, [r0]
; CHECK-NEXT: ldr.w r10, [r0, #4]
; CHECK-NEXT: sub.w r6, r9, #1
; CHECK-NEXT: cmp r6, #3
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: add.w r4, r10, r6, lsl #1
; CHECK-NEXT: lsrs r5, r3, #2
; CHECK-NEXT: ldrh.w r8, [r7, #6]
; CHECK-NEXT: ldrh.w r12, [r7, #4]
; CHECK-NEXT: ldrh r6, [r7, #2]
; CHECK-NEXT: ldrh r7, [r7]
; CHECK-NEXT: wls lr, r5, .LBB15_5
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r5, r3, #3
; CHECK-NEXT: add.w r9, r10, #2
; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w r5, r2, r5, lsl #1
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
; CHECK-NEXT: sub.w r11, r9, #2
; CHECK-NEXT: add.w r5, r9, #2
; CHECK-NEXT: vstrb.8 q0, [r4], #8
; CHECK-NEXT: vldrw.u32 q0, [r11]
; CHECK-NEXT: vldrw.u32 q1, [r9]
; CHECK-NEXT: vmul.f16 q0, q0, r7
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
; CHECK-NEXT: add.w r9, r9, #8
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB15_5: @ %while.end
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vctp.16 r5
; CHECK-NEXT: add.w r1, r10, #2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r10]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: add.w r1, r10, #6
; CHECK-NEXT: vmul.f16 q0, q0, r7
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r2]
; CHECK-NEXT: ldr.w r10, [r0, #4]
; CHECK-NEXT: .LBB15_6: @ %if.end
; CHECK-NEXT: add.w r0, r10, r3, lsl #1
; CHECK-NEXT: lsr.w r1, r9, #2
; CHECK-NEXT: wls lr, r1, .LBB15_10
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
; CHECK-NEXT: bic r2, r9, #3
; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add.w r1, r10, r1, lsl #1
; CHECK-NEXT: .LBB15_8: @ %while.body51
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
; CHECK-NEXT: vstrb.8 q0, [r3], #8
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
; CHECK-NEXT: ands r1, r9, #3
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vctp.16 r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r10]
; CHECK-NEXT: .LBB15_12: @ %if.end61
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  ; Load the state pointer (field 1), coefficient pointer (field 2) and the
  ; 16-bit tap count (field 0) from the instance struct.
  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
  %i = load ptr, ptr %pState1, align 4
  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
  %i1 = load ptr, ptr %pCoeffs2, align 4
  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
  %i2 = load i16, ptr %numTaps3, align 4
  %conv = zext i16 %i2 to i32
  ; Take the vector path only when numTaps - 1 is in 0..3 (unsigned compare).
  %sub = add nsw i32 %conv, -1
  %cmp = icmp ult i32 %sub, 4
  br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
  ; The state write cursor starts numTaps - 1 halves into the state buffer.
  %arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
  ; Load four consecutive half coefficients.
  %incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
  %i3 = load half, ptr %i1, align 4
  %incdec.ptr6 = getelementptr inbounds half, ptr %i1, i32 2
  %i4 = load half, ptr %incdec.ptr, align 4
  %incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 3
  %i5 = load half, ptr %incdec.ptr6, align 4
  %i6 = load half, ptr %incdec.ptr7, align 4
  ; Loop trip count is blockSize / 4; skip the loop when it is zero.
  %shr = lshr i32 %blockSize, 2
  %cmp9146 = icmp eq i32 %shr, 0
  ; Splat each coefficient across <8 x half>.
  %.pre161 = insertelement <8 x half> undef, half %i3, i32 0
  %.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
  %.pre163 = insertelement <8 x half> undef, half %i4, i32 0
  %.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
  %.pre165 = insertelement <8 x half> undef, half %i5, i32 0
  %.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
  %.pre167 = insertelement <8 x half> undef, half %i6, i32 0
  %.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
  br i1 %cmp9146, label %while.end, label %while.body.lr.ph

while.body.lr.ph: ; preds = %if.then
  ; blockSize rounded down to a multiple of 4, used as an offset into pDst.
  %i7 = and i32 %blockSize, -4
  %scevgep158 = getelementptr half, ptr %pDst, i32 %i7
  br label %while.body

while.body: ; preds = %while.body, %while.body.lr.ph
  %pStateCur.0151 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %pSamples.0150 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
  %pOutput.0149 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
  %pTempSrc.0148 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
  %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
  ; Copy 8 halves of input into the state buffer; both cursors then move on
  ; by only 4 halves, so consecutive iterations overlap.
  %i9 = load <8 x half>, ptr %pTempSrc.0148, align 4
  store <8 x half> %i9, ptr %pStateCur.0151, align 4
  %add.ptr = getelementptr inbounds half, ptr %pStateCur.0151, i32 4
  %add.ptr11 = getelementptr inbounds half, ptr %pTempSrc.0148, i32 4
  ; acc = samples[0..] * c0 + samples[1..] * c1 + samples[2..] * c2
  ;     + samples[3..] * c3
  %i12 = load <8 x half>, ptr %pSamples.0150, align 4
  %i13 = fmul fast <8 x half> %i12, %.pre162
  %arrayidx12 = getelementptr inbounds half, ptr %pSamples.0150, i32 1
  %i15 = load <8 x half>, ptr %arrayidx12, align 4
  %mul = fmul fast <8 x half> %i15, %.pre164
  %add = fadd fast <8 x half> %mul, %i13
  %arrayidx13 = getelementptr inbounds half, ptr %pSamples.0150, i32 2
  %i17 = load <8 x half>, ptr %arrayidx13, align 4
  %mul16 = fmul fast <8 x half> %i17, %.pre166
  %add17 = fadd fast <8 x half> %add, %mul16
  %arrayidx18 = getelementptr inbounds half, ptr %pSamples.0150, i32 3
  %i19 = load <8 x half>, ptr %arrayidx18, align 4
  %mul21 = fmul fast <8 x half> %i19, %.pre168
  %add22 = fadd fast <8 x half> %add17, %mul21
  store <8 x half> %add22, ptr %pOutput.0149, align 4
  %add.ptr23 = getelementptr inbounds half, ptr %pOutput.0149, i32 4
%add.ptr24 = getelementptr inbounds half, ptr %pSamples.0150, i32 4 908 %dec = add nsw i32 %blkCnt.0147, -1 909 %cmp9 = icmp eq i32 %dec, 0 910 br i1 %cmp9, label %while.end.loopexit, label %while.body 911 912while.end.loopexit: ; preds = %while.body 913 %scevgep157 = getelementptr half, ptr %pSrc, i32 %i7 914 %scevgep159 = getelementptr half, ptr %i, i32 %i7 915 br label %while.end 916 917while.end: ; preds = %while.end.loopexit, %if.then 918 %pTempSrc.0.lcssa = phi ptr [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] 919 %pOutput.0.lcssa = phi ptr [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] 920 %pSamples.0.lcssa = phi ptr [ %scevgep159, %while.end.loopexit ], [ %i, %if.then ] 921 %pStateCur.0.lcssa = phi ptr [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] 922 %and = and i32 %blockSize, 3 923 %i21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and) 924 %i23 = load <8 x half>, ptr %pTempSrc.0.lcssa, align 4 925 tail call void @llvm.masked.store.v8f16.p0(<8 x half> %i23, ptr %pStateCur.0.lcssa, i32 4, <8 x i1> %i21) 926 %i26 = load <8 x half>, ptr %pSamples.0.lcssa, align 4 927 %i27 = fmul fast <8 x half> %i26, %.pre162 928 %arrayidx29 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 1 929 %i29 = load <8 x half>, ptr %arrayidx29, align 4 930 %mul32 = fmul fast <8 x half> %i29, %.pre164 931 %add33 = fadd fast <8 x half> %mul32, %i27 932 %arrayidx34 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 2 933 %i31 = load <8 x half>, ptr %arrayidx34, align 4 934 %mul37 = fmul fast <8 x half> %i31, %.pre166 935 %add38 = fadd fast <8 x half> %add33, %mul37 936 %arrayidx39 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 3 937 %i33 = load <8 x half>, ptr %arrayidx39, align 4 938 %mul42 = fmul fast <8 x half> %i33, %.pre168 939 %add43 = fadd fast <8 x half> %add38, %mul42 940 tail call void @llvm.masked.store.v8f16.p0(<8 x half> %add43, ptr %pOutput.0.lcssa, i32 4, <8 x i1> %i21) 941 %.pre = load ptr, ptr %pState1, 
align 4 942 br label %if.end 943 944if.end: ; preds = %while.end, %entry 945 %i35 = phi ptr [ %.pre, %while.end ], [ %i, %entry ] 946 %arrayidx45 = getelementptr inbounds half, ptr %i35, i32 %blockSize 947 %shr47 = lshr i32 %conv, 2 948 %cmp49141 = icmp eq i32 %shr47, 0 949 br i1 %cmp49141, label %while.end55, label %while.body51.preheader 950 951while.body51.preheader: ; preds = %if.end 952 %i36 = and i32 %conv, 65532 953 %i37 = add i32 %i36, %blockSize 954 %scevgep = getelementptr half, ptr %i35, i32 %i37 955 br label %while.body51 956 957while.body51: ; preds = %while.body51, %while.body51.preheader 958 %pTempSrc.1144 = phi ptr [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] 959 %pTempDest.0143 = phi ptr [ %add.ptr53, %while.body51 ], [ %i35, %while.body51.preheader ] 960 %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] 961 %i39 = load <8 x half>, ptr %pTempSrc.1144, align 4 962 store <8 x half> %i39, ptr %pTempDest.0143, align 4 963 %add.ptr52 = getelementptr inbounds half, ptr %pTempSrc.1144, i32 4 964 %add.ptr53 = getelementptr inbounds half, ptr %pTempDest.0143, i32 4 965 %dec54 = add nsw i32 %blkCnt.1142, -1 966 %cmp49 = icmp eq i32 %dec54, 0 967 br i1 %cmp49, label %while.end55.loopexit, label %while.body51 968 969while.end55.loopexit: ; preds = %while.body51 970 %scevgep156 = getelementptr half, ptr %i35, i32 %i36 971 br label %while.end55 972 973while.end55: ; preds = %while.end55.loopexit, %if.end 974 %pTempDest.0.lcssa = phi ptr [ %i35, %if.end ], [ %scevgep156, %while.end55.loopexit ] 975 %pTempSrc.1.lcssa = phi ptr [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] 976 %and56 = and i32 %conv, 3 977 %cmp57 = icmp eq i32 %and56, 0 978 br i1 %cmp57, label %if.end61, label %if.then59 979 980if.then59: ; preds = %while.end55 981 %i41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56) 982 %i43 = load <8 x half>, ptr %pTempSrc.1.lcssa, align 4 983 tail call void 
@llvm.masked.store.v8f16.p0(<8 x half> %i43, ptr %pTempDest.0.lcssa, i32 4, <8 x i1> %i41)
  br label %if.end61

if.end61: ; preds = %if.then59, %while.end55
  ret void
}


; fir: FIR-style multiply-accumulate loop nest on <8 x half> data, used to
; check the MVE code generated for nested loops. The FileCheck assertions
; below are autogenerated; regenerate them with the update script instead of
; editing them by hand.
;   * entry loads the state pointer (field 1), the coefficient pointer
;     (field 2) and the i16 tap count (field 0) from %S. Nothing else runs
;     unless %blockSize is greater than 7.
;   * while.body runs %blockSize >> 2 times. Each iteration loads the first
;     eight scalar coefficients, copies one <8 x half> vector from the source
;     cursor to the state cursor, and builds an accumulator from eight vector
;     loads at half offsets 0..7 (one fmul followed by seven llvm.fma calls).
;   * for.body, entered when the tap count is greater than 15, folds in eight
;     further taps per iteration for max((taps - 8) / 8, 1) iterations.
;   * while.body76 folds in the remaining (taps - 8) & 7 taps one at a time.
;   * while.end stores the accumulator through the output cursor.
; The source, state and output cursors advance by 4 half elements per outer
; iteration although every vector access is 8 lanes wide.
; In the expected assembly the two inner loops are low-overhead loops
; (dls/le and wls/le); the outer loop counts down with subs.w and beq.w.
define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: fir:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: cmp r3, #8
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: lsrs.w r12, r3, #2
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: ldrh r4, [r0]
; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: ldrd r5, r3, [r0, #4]
; CHECK-NEXT: sub.w r0, r4, #8
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
; CHECK-NEXT: asrs r6, r7, #3
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: it gt
; CHECK-NEXT: asrgt r1, r7, #3
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r7, r3, #16
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
; CHECK-NEXT: ldrh.w r8, [r3, #12]
; CHECK-NEXT: ldrh r7, [r3, #10]
; CHECK-NEXT: ldrh r4, [r3, #8]
; CHECK-NEXT: ldrh r6, [r3, #6]
; CHECK-NEXT: ldrh.w r9, [r3, #4]
; CHECK-NEXT: ldrh.w r11, [r3, #2]
; CHECK-NEXT: ldrh.w r10, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmul.f16 q0, q0, r10
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r9
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: adds r4, r5, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-14]
; CHECK-NEXT: adds r4, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-12]
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-10]
; CHECK-NEXT: add.w r4, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-8]
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-6]
; CHECK-NEXT: ldrh r4, [r6, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-4]
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r4, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
  %i = load ptr, ptr %pState1, align 4
  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
  %i1 = load ptr, ptr %pCoeffs2, align 4
  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
  %i2 = load i16, ptr %numTaps3, align 4
  %conv = zext i16 %i2 to i32
  %cmp = icmp ugt i32 %blockSize, 7
  br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
  %shr = lshr i32 %blockSize, 2
  %cmp5217 = icmp eq i32 %shr, 0
  br i1 %cmp5217, label %if.end, label %while.body.lr.ph

while.body.lr.ph: ; preds = %if.then
  %sub = add nsw i32 %conv, -1
  %arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
  %incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
  %incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 2
  %incdec.ptr8 = getelementptr inbounds half, ptr %i1, i32 3
  %incdec.ptr9 = getelementptr inbounds half, ptr %i1, i32 4
  %incdec.ptr10 = getelementptr inbounds half, ptr %i1, i32 5
  %incdec.ptr11 = getelementptr inbounds half, ptr %i1, i32 6
  %incdec.ptr12 = getelementptr inbounds half, ptr %i1, i32 7
  %sub37 = add nsw i32 %conv, -8
  %div = sdiv i32 %sub37, 8
  %pCoeffsCur.0199 = getelementptr inbounds half, ptr %i1, i32 8
  %cmp38201 = icmp ugt i16 %i2, 15
  %and = and i32 %sub37, 7
  %cmp74210 = icmp eq i32 %and, 0
  %idx.neg = sub nsw i32 0, %conv
  %i3 = icmp sgt i32 %div, 1
  %smax = select i1 %i3, i32 %div, i32 1
  br label %while.body

; Outer loop: one accumulator per iteration, seeded from the first eight taps.
while.body: ; preds = %while.end, %while.body.lr.ph
  %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
  %pStateCur.0221 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
  %pSamples.0220 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
  %pTempSrc.0219 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
  %pOutput.0218 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
  %i4 = load half, ptr %i1, align 4
  %i5 = load half, ptr %incdec.ptr, align 4
  %i6 = load half, ptr %incdec.ptr7, align 4
  %i7 = load half, ptr %incdec.ptr8, align 4
  %i8 = load half, ptr %incdec.ptr9, align 4
  %i9 = load half, ptr %incdec.ptr10, align 4
  %i10 = load half, ptr %incdec.ptr11, align 4
  %i11 = load half, ptr %incdec.ptr12, align 4
  %i13 = load <8 x half>, ptr %pTempSrc.0219, align 4
  store <8 x half> %i13, ptr %pStateCur.0221, align 4
  %add.ptr = getelementptr inbounds half, ptr %pStateCur.0221, i32 4
  %add.ptr14 = getelementptr inbounds half, ptr %pTempSrc.0219, i32 4
  %i16 = load <8 x half>, ptr %pSamples.0220, align 4
  %.splatinsert = insertelement <8 x half> undef, half %i4, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %i17 = fmul fast <8 x half> %i16, %.splat
  %arrayidx15 = getelementptr inbounds half, ptr %pSamples.0220, i32 1
  %i19 = load <8 x half>, ptr %arrayidx15, align 4
  %.splatinsert16 = insertelement <8 x half> undef, half %i5, i32 0
  %.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
  %i20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i19, <8 x half> %.splat17, <8 x half> %i17)
  %arrayidx18 = getelementptr inbounds half, ptr %pSamples.0220, i32 2
  %i22 = load <8 x half>, ptr %arrayidx18, align 4
  %.splatinsert19 = insertelement <8 x half> undef, half %i6, i32 0
  %.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
  %i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i22, <8 x half> %.splat20, <8 x half> %i20)
  %arrayidx21 = getelementptr inbounds half, ptr %pSamples.0220, i32 3
  %i25 = load <8 x half>, ptr %arrayidx21, align 4
  %.splatinsert22 = insertelement <8 x half> undef, half %i7, i32 0
  %.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
  %i26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i25, <8 x half> %.splat23, <8 x half> %i23)
  %arrayidx24 = getelementptr inbounds half, ptr %pSamples.0220, i32 4
  %i28 = load <8 x half>, ptr %arrayidx24, align 4
  %.splatinsert25 = insertelement <8 x half> undef, half %i8, i32 0
  %.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
  %i29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i28, <8 x half> %.splat26, <8 x half> %i26)
  %arrayidx27 = getelementptr inbounds half, ptr %pSamples.0220, i32 5
  %i31 = load <8 x half>, ptr %arrayidx27, align 4
  %.splatinsert28 = insertelement <8 x half> undef, half %i9, i32 0
  %.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
  %i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i31, <8 x half> %.splat29, <8 x half> %i29)
  %arrayidx30 = getelementptr inbounds half, ptr %pSamples.0220, i32 6
  %i34 = load <8 x half>, ptr %arrayidx30, align 4
  %.splatinsert31 = insertelement <8 x half> undef, half %i10, i32 0
  %.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
  %i35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i34, <8 x half> %.splat32, <8 x half> %i32)
  %arrayidx33 = getelementptr inbounds half, ptr %pSamples.0220, i32 7
  %i37 = load <8 x half>, ptr %arrayidx33, align 4
  %.splatinsert34 = insertelement <8 x half> undef, half %i11, i32 0
  %.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
  %i38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i37, <8 x half> %.splat35, <8 x half> %i35)
  %pSamples.1200 = getelementptr inbounds half, ptr %pSamples.0220, i32 8
  br i1 %cmp38201, label %for.body, label %for.end

; Inner loop 1: eight additional taps per iteration, %smax iterations.
for.body: ; preds = %for.body, %while.body
  %pSamples.1207 = phi ptr [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
  %pCoeffsCur.0206 = phi ptr [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
  %.pn205 = phi ptr [ %pCoeffsCur.0206, %for.body ], [ %i1, %while.body ]
  %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
  %vecAcc0.0203 = phi <8 x half> [ %i70, %for.body ], [ %i38, %while.body ]
  %pSamples.0.pn202 = phi ptr [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
  %incdec.ptr40 = getelementptr inbounds half, ptr %.pn205, i32 9
  %i39 = load half, ptr %pCoeffsCur.0206, align 4
  %incdec.ptr41 = getelementptr inbounds half, ptr %.pn205, i32 10
  %i40 = load half, ptr %incdec.ptr40, align 4
  %incdec.ptr42 = getelementptr inbounds half, ptr %.pn205, i32 11
  %i41 = load half, ptr %incdec.ptr41, align 4
  %incdec.ptr43 = getelementptr inbounds half, ptr %.pn205, i32 12
  %i42 = load half, ptr %incdec.ptr42, align 4
  %incdec.ptr44 = getelementptr inbounds half, ptr %.pn205, i32 13
  %i43 = load half, ptr %incdec.ptr43, align 4
  %incdec.ptr45 = getelementptr inbounds half, ptr %.pn205, i32 14
  %i44 = load half, ptr %incdec.ptr44, align 4
  %incdec.ptr46 = getelementptr inbounds half, ptr %.pn205, i32 15
  %i45 = load half, ptr %incdec.ptr45, align 4
  %i46 = load half, ptr %incdec.ptr46, align 4
  %i48 = load <8 x half>, ptr %pSamples.1207, align 4
  %.splatinsert48 = insertelement <8 x half> undef, half %i39, i32 0
  %.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
  %i49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
  %arrayidx50 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 9
  %i51 = load <8 x half>, ptr %arrayidx50, align 4
  %.splatinsert51 = insertelement <8 x half> undef, half %i40, i32 0
  %.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
  %i52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i51, <8 x half> %.splat52, <8 x half> %i49)
  %arrayidx53 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 10
  %i54 = load <8 x half>, ptr %arrayidx53, align 4
  %.splatinsert54 = insertelement <8 x half> undef, half %i41, i32 0
  %.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
  %i55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i54, <8 x half> %.splat55, <8 x half> %i52)
  %arrayidx56 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 11
  %i57 = load <8 x half>, ptr %arrayidx56, align 4
  %.splatinsert57 = insertelement <8 x half> undef, half %i42, i32 0
  %.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
  %i58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i57, <8 x half> %.splat58, <8 x half> %i55)
  %arrayidx59 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 12
  %i60 = load <8 x half>, ptr %arrayidx59, align 4
  %.splatinsert60 = insertelement <8 x half> undef, half %i43, i32 0
  %.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
  %i61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i60, <8 x half> %.splat61, <8 x half> %i58)
  %arrayidx62 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 13
  %i63 = load <8 x half>, ptr %arrayidx62, align 4
  %.splatinsert63 = insertelement <8 x half> undef, half %i44, i32 0
  %.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
  %i64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i63, <8 x half> %.splat64, <8 x half> %i61)
  %arrayidx65 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 14
  %i66 = load <8 x half>, ptr %arrayidx65, align 4
  %.splatinsert66 = insertelement <8 x half> undef, half %i45, i32 0
  %.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
  %i67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i66, <8 x half> %.splat67, <8 x half> %i64)
  %arrayidx68 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 15
  %i69 = load <8 x half>, ptr %arrayidx68, align 4
  %.splatinsert69 = insertelement <8 x half> undef, half %i46, i32 0
  %.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
  %i70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i69, <8 x half> %.splat70, <8 x half> %i67)
  %inc = add nuw nsw i32 %i.0204, 1
  %pCoeffsCur.0 = getelementptr inbounds half, ptr %pCoeffsCur.0206, i32 8
  %pSamples.1 = getelementptr inbounds half, ptr %pSamples.1207, i32 8
  %exitcond = icmp eq i32 %inc, %smax
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body, %while.body
  %vecAcc0.0.lcssa = phi <8 x half> [ %i38, %while.body ], [ %i70, %for.body ]
  %pCoeffsCur.0.lcssa = phi ptr [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
  %pSamples.1.lcssa = phi ptr [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
  br i1 %cmp74210, label %while.end, label %while.body76

; Inner loop 2: leftover taps, one scalar coefficient per iteration.
while.body76: ; preds = %while.body76, %for.end
  %pCoeffsCur.1214 = phi ptr [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
  %vecAcc0.1213 = phi <8 x half> [ %i74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
  %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
1313 %pSamples.2211 = phi ptr [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] 1314 %incdec.ptr77 = getelementptr inbounds half, ptr %pCoeffsCur.1214, i32 1 1315 %i71 = load half, ptr %pCoeffsCur.1214, align 4 1316 %i73 = load <8 x half>, ptr %pSamples.2211, align 4 1317 %.splatinsert78 = insertelement <8 x half> undef, half %i71, i32 0 1318 %.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer 1319 %i74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213) 1320 %incdec.ptr80 = getelementptr inbounds half, ptr %pSamples.2211, i32 1 1321 %dec = add nsw i32 %numCnt.0212, -1 1322 %cmp74 = icmp sgt i32 %numCnt.0212, 1 1323 br i1 %cmp74, label %while.body76, label %while.end.loopexit 1324 1325while.end.loopexit: ; preds = %while.body76 1326 %scevgep = getelementptr half, ptr %pSamples.1.lcssa, i32 %and 1327 br label %while.end 1328 1329while.end: ; preds = %while.end.loopexit, %for.end 1330 %pSamples.2.lcssa = phi ptr [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] 1331 %vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %i74, %while.end.loopexit ] 1332 store <8 x half> %vecAcc0.1.lcssa, ptr %pOutput.0218, align 4 1333 %add.ptr81 = getelementptr inbounds half, ptr %pOutput.0218, i32 4 1334 %add.ptr82 = getelementptr inbounds half, ptr %pSamples.2.lcssa, i32 4 1335 %add.ptr83 = getelementptr inbounds half, ptr %add.ptr82, i32 %idx.neg 1336 %dec84 = add nsw i32 %blkCnt.0222, -1 1337 %cmp5 = icmp eq i32 %dec84, 0 1338 br i1 %cmp5, label %if.end, label %while.body 1339 1340if.end: ; preds = %while.end, %if.then, %entry 1341 ret void 1342} 1343 1344%struct.arm_biquad_cascade_df2T_instance_f16 = type { i8, ptr, ptr } 1345define void @arm_biquad_cascade_df2T_f16(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) { 1346; CHECK-LABEL: arm_biquad_cascade_df2T_f16: 1347; CHECK: @ %bb.0: @ 
%entry 1348; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} 1349; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} 1350; CHECK-NEXT: .pad #4 1351; CHECK-NEXT: sub sp, #4 1352; CHECK-NEXT: .vsave {d8, d9, d10, d11} 1353; CHECK-NEXT: vpush {d8, d9, d10, d11} 1354; CHECK-NEXT: vmov.i32 q0, #0x0 1355; CHECK-NEXT: ldrd r6, r12, [r0, #4] 1356; CHECK-NEXT: ldrb.w r9, [r0] 1357; CHECK-NEXT: vldr.16 s0, .LCPI17_0 1358; CHECK-NEXT: lsr.w r8, r3, #1 1359; CHECK-NEXT: b .LBB17_3 1360; CHECK-NEXT: .LBB17_1: @ %if.else 1361; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 1362; CHECK-NEXT: vmovx.f16 s5, s4 1363; CHECK-NEXT: vstr.16 s4, [r6] 1364; CHECK-NEXT: .LBB17_2: @ %if.end 1365; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 1366; CHECK-NEXT: vstr.16 s5, [r6, #2] 1367; CHECK-NEXT: add.w r12, r12, #10 1368; CHECK-NEXT: adds r6, #4 1369; CHECK-NEXT: subs.w r9, r9, #1 1370; CHECK-NEXT: mov r1, r2 1371; CHECK-NEXT: beq .LBB17_8 1372; CHECK-NEXT: .LBB17_3: @ %do.body 1373; CHECK-NEXT: @ =>This Loop Header: Depth=1 1374; CHECK-NEXT: @ Child Loop BB17_5 Depth 2 1375; CHECK-NEXT: vldrh.u16 q2, [r12] 1376; CHECK-NEXT: movs r5, #0 1377; CHECK-NEXT: vmov q4, q2 1378; CHECK-NEXT: vshlc q4, r5, #16 1379; CHECK-NEXT: vldrh.u16 q3, [r12, #4] 1380; CHECK-NEXT: vmov q5, q3 1381; CHECK-NEXT: vshlc q5, r5, #16 1382; CHECK-NEXT: vldrh.u16 q1, [r6] 1383; CHECK-NEXT: vmov.f32 s5, s1 1384; CHECK-NEXT: mov r5, r2 1385; CHECK-NEXT: wls lr, r8, .LBB17_6 1386; CHECK-NEXT: @ %bb.4: @ %while.body.preheader 1387; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 1388; CHECK-NEXT: mov r5, r2 1389; CHECK-NEXT: .LBB17_5: @ %while.body 1390; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1 1391; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 1392; CHECK-NEXT: ldrh r7, [r1], #4 1393; CHECK-NEXT: vmov r0, s0 1394; CHECK-NEXT: vfma.f16 q1, q2, r7 1395; CHECK-NEXT: ldrh r4, [r1, #-2] 1396; CHECK-NEXT: vmov.u16 r7, q1[0] 1397; CHECK-NEXT: vfma.f16 q1, q3, r7 1398; CHECK-NEXT: vins.f16 s5, s0 1399; CHECK-NEXT: vfma.f16 q1, q4, r4 
1400; CHECK-NEXT: vmov.u16 r4, q1[1] 1401; CHECK-NEXT: vfma.f16 q1, q5, r4 1402; CHECK-NEXT: strh r4, [r5, #2] 1403; CHECK-NEXT: vmov.f32 s4, s5 1404; CHECK-NEXT: strh r7, [r5], #4 1405; CHECK-NEXT: vmov.16 q1[2], r0 1406; CHECK-NEXT: le lr, .LBB17_5 1407; CHECK-NEXT: .LBB17_6: @ %while.end 1408; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 1409; CHECK-NEXT: lsls r0, r3, #31 1410; CHECK-NEXT: beq .LBB17_1 1411; CHECK-NEXT: @ %bb.7: @ %if.then 1412; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 1413; CHECK-NEXT: ldrh r0, [r1] 1414; CHECK-NEXT: vfma.f16 q1, q2, r0 1415; CHECK-NEXT: vmov.u16 r0, q1[0] 1416; CHECK-NEXT: vfma.f16 q1, q3, r0 1417; CHECK-NEXT: strh r0, [r5] 1418; CHECK-NEXT: vmovx.f16 s2, s4 1419; CHECK-NEXT: vstr.16 s2, [r6] 1420; CHECK-NEXT: b .LBB17_2 1421; CHECK-NEXT: .LBB17_8: @ %do.end 1422; CHECK-NEXT: vpop {d8, d9, d10, d11} 1423; CHECK-NEXT: add sp, #4 1424; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} 1425; CHECK-NEXT: .p2align 1 1426; CHECK-NEXT: @ %bb.9: 1427; CHECK-NEXT: .LCPI17_0: 1428; CHECK-NEXT: .short 0x0000 @ half 0 1429entry: 1430 %pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 1 1431 %i = load ptr, ptr %pState1, align 4 1432 %numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 0 1433 %i1 = load i8, ptr %numStages, align 4 1434 %conv = zext i8 %i1 to i32 1435 %pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 2 1436 %i2 = load ptr, ptr %pCoeffs, align 4 1437 %div = lshr i32 %blockSize, 1 1438 %cmp.not90 = icmp eq i32 %div, 0 1439 %and = and i32 %blockSize, 1 1440 %tobool.not = icmp eq i32 %and, 0 1441 br label %do.body 1442 1443do.body: ; preds = %if.end, %entry 1444 %stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ] 1445 %pCurCoeffs.0 = phi ptr [ %i2, %entry ], [ %add.ptr2, %if.end ] 1446 %pState.0 = phi ptr [ %i, %entry ], [ %pState.1, %if.end ] 1447 %pIn.0 = phi ptr [ %pSrc, 
%entry ], [ %pDst, %if.end ] 1448 %i4 = load <8 x half>, ptr %pCurCoeffs.0, align 2 1449 %add.ptr = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 2 1450 %i6 = load <8 x half>, ptr %add.ptr, align 2 1451 %add.ptr2 = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 5 1452 %i8 = load <8 x half>, ptr %pState.0, align 2 1453 %i9 = shufflevector <8 x half> %i8, <8 x half> <half poison, half poison, half 0xH0000, half 0xH0000, half poison, half poison, half poison, half poison>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> 1454 %i10 = bitcast <8 x half> %i4 to <8 x i16> 1455 %i11 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i10, i32 0, i32 16) 1456 %i12 = extractvalue { i32, <8 x i16> } %i11, 0 1457 %i13 = extractvalue { i32, <8 x i16> } %i11, 1 1458 %i14 = bitcast <8 x i16> %i13 to <8 x half> 1459 %i15 = bitcast <8 x half> %i6 to <8 x i16> 1460 %i16 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i15, i32 %i12, i32 16) 1461 %i17 = extractvalue { i32, <8 x i16> } %i16, 1 1462 %i18 = bitcast <8 x i16> %i17 to <8 x half> 1463 br i1 %cmp.not90, label %while.end, label %while.body 1464 1465while.body: ; preds = %while.body, %do.body 1466 %pIn.194 = phi ptr [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ] 1467 %state.093 = phi <8 x half> [ %i30, %while.body ], [ %i9, %do.body ] 1468 %pOut.192 = phi ptr [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ] 1469 %sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ] 1470 %incdec.ptr = getelementptr inbounds half, ptr %pIn.194, i32 1 1471 %i19 = load half, ptr %pIn.194, align 2 1472 %incdec.ptr4 = getelementptr inbounds half, ptr %pIn.194, i32 2 1473 %i20 = load half, ptr %incdec.ptr, align 2 1474 %.splatinsert = insertelement <8 x half> poison, half %i19, i32 0 1475 %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> poison, <8 x i32> zeroinitializer 1476 %i21 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> 
%.splat, <8 x half> %state.093) 1477 %i22 = extractelement <8 x half> %i21, i32 0 1478 %.splat6 = shufflevector <8 x half> %i21, <8 x half> poison, <8 x i32> zeroinitializer 1479 %i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat6, <8 x half> %i21) 1480 %i24 = insertelement <8 x half> %i23, half 0xH0000, i32 3 1481 %.splatinsert7 = insertelement <8 x half> poison, half %i20, i32 0 1482 %.splat8 = shufflevector <8 x half> %.splatinsert7, <8 x half> poison, <8 x i32> zeroinitializer 1483 %i25 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i14, <8 x half> %.splat8, <8 x half> %i24) 1484 %i26 = extractelement <8 x half> %i25, i32 1 1485 %.splat10 = shufflevector <8 x half> %i25, <8 x half> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1486 %i27 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i18, <8 x half> %.splat10, <8 x half> %i25) 1487 %i28 = shufflevector <8 x half> %i27, <8 x half> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 3, i32 4, i32 5, i32 6, i32 7> 1488 %i29 = insertelement <8 x half> %i28, half 0xH0000, i32 2 1489 %i30 = shufflevector <8 x half> %i29, <8 x half> %i27, <8 x i32> <i32 0, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1490 %incdec.ptr11 = getelementptr inbounds half, ptr %pOut.192, i32 1 1491 store half %i22, ptr %pOut.192, align 2 1492 %incdec.ptr12 = getelementptr inbounds half, ptr %pOut.192, i32 2 1493 store half %i26, ptr %incdec.ptr11, align 2 1494 %dec = add nsw i32 %sample.091, -1 1495 %cmp.not = icmp eq i32 %dec, 0 1496 br i1 %cmp.not, label %while.end, label %while.body 1497 1498while.end: ; preds = %while.body, %do.body 1499 %pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ] 1500 %state.0.lcssa = phi <8 x half> [ %i9, %do.body ], [ %i30, %while.body ] 1501 %pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ] 1502 br i1 %tobool.not, label %if.else, label %if.then 1503 1504if.then: ; preds = 
%while.end
  %i31 = load half, ptr %pIn.1.lcssa, align 2
  %.splatinsert14 = insertelement <8 x half> poison, half %i31, i32 0
  %.splat15 = shufflevector <8 x half> %.splatinsert14, <8 x half> poison, <8 x i32> zeroinitializer
  %i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> %.splat15, <8 x half> %state.0.lcssa)
  %i33 = extractelement <8 x half> %i32, i32 0
  %.splat17 = shufflevector <8 x half> %i32, <8 x half> poison, <8 x i32> zeroinitializer
  %i34 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat17, <8 x half> %i32)
  store half %i33, ptr %pOut.1.lcssa, align 2
  %i35 = extractelement <8 x half> %i34, i32 1
  store half %i35, ptr %pState.0, align 2
  %i36 = extractelement <8 x half> %i34, i32 2
  br label %if.end

if.else:                                          ; preds = %while.end
  %i37 = extractelement <8 x half> %state.0.lcssa, i32 0
  store half %i37, ptr %pState.0, align 2
  %i38 = extractelement <8 x half> %state.0.lcssa, i32 1
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %.sink = phi half [ %i38, %if.else ], [ %i36, %if.then ]
  %i39 = getelementptr inbounds half, ptr %pState.0, i32 1
  store half %.sink, ptr %i39, align 2
  %pState.1 = getelementptr inbounds half, ptr %pState.0, i32 2
  %dec23 = add i32 %stage.0, -1
  %cmp24.not = icmp eq i32 %dec23, 0
  br i1 %cmp24.not, label %do.end, label %do.body

do.end:                                           ; preds = %if.end
  ret void
}

; Horizontal sum of all eight half lanes of %in, returned as a scalar half.
; The reduction is written as two whole-vector shuffle+add steps followed by
; one scalar add:
;   1. swap neighbouring half lanes and add, so lane k holds in[k] + in[k^1];
;   2. view that result as <4 x i32>, bring 32-bit elements 1 and 3 down to
;      positions 0 and 2, and add, so (with the little-endian lane layout of
;      the target in the RUN line) half lanes 0 and 4 each hold the sum of one
;      group of four input lanes;
;   3. add half lanes 0 and 4 to obtain the total.
; Every fadd carries the `fast` flag. The expected assembly directly below
; (vrev32.16 / vrev64.32 for the two shuffles, then a scalar add of s0 and s2)
; is autogenerated, as stated by the NOTE at the top of the file; regenerate
; it with utils/update_llc_test_checks.py instead of editing it by hand.
define arm_aapcs_vfpcc half @vecAddAcrossF16Mve(<8 x half> %in) {
; CHECK-LABEL: vecAddAcrossF16Mve:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vrev32.16 q1, q0
; CHECK-NEXT: vadd.f16 q0, q1, q0
; CHECK-NEXT: vrev64.32 q1, q0
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: bx lr
entry:
  ; Step 1: exchange each adjacent pair of half lanes, then add to the input.
  %i = shufflevector <8 x half> %in, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %i1 = fadd fast <8 x half> %i, %in
  ; Step 2: treat two halves as one 32-bit element so a pair moves as a unit.
  %i2 = bitcast <8 x half> %i1 to <4 x i32>
  ; Result elements 1 and 3 are don't-care; they are spelled `poison`, matching
  ; the unused second operand, rather than the deprecated `undef`.
  %i3 = shufflevector <4 x i32> %i2, <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
  %i4 = bitcast <4 x i32> %i3 to <8 x half>
  %i5 = fadd fast <8 x half> %i1, %i4
  ; Step 3: only half lanes 0 and 4 of %i5 are consumed.
  %i6 = extractelement <8 x half> %i5, i32 0
  %i7 = extractelement <8 x half> %i5, i32 4
  %add = fadd fast half %i6, %i7
  ret half %add
}

declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
declare void @llvm.assume(i1)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)