; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Codegen test for the MVE absolute max/min instructions (vmaxnma / vminnma)
; with their operands given in both orders.
;
; Every test comes in two flavours: the plain one, and a "_c" (commuted) one
; in which the two data operands of the max/min are swapped. The expected
; assembly below each 'define' is generated by the script named on the first
; line; regenerate it with that script instead of editing it by hand.

; F32

; maxnum(fabs(a), fabs(b)) is expected to become a single vmaxnma.f32.
define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}

; Same as maxf32 with the maxnum operands swapped; the expected output is the
; same single instruction.
define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}

; minnum(fabs(a), fabs(b)) is expected to become a single vminnma.f32.
define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}

; Same as minf32 with the minnum operands swapped.
define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f32 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}


; Predicated intrinsic, predicate produced by an fcmp on the same operands.
; Expected output is a VPT block containing the predicated instruction.
define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f32 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}

; Swapped operands on the predicated intrinsic: the expected output keeps the
; swapped order (destination q1) and copies the result back to q0.
define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}

; Predicated vminnma counterpart of maxpredf32.
define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vminnmat.f32 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}

; Predicated vminnma counterpart of maxpredf32_c.
define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f32 gt, q1, q0
; CHECK-NEXT:    vminnmat.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}



; F16
;
; The <8 x half> versions of the eight tests above.

define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnma.f16 q0, q1
; CHECK-NEXT:    bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f16 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vmaxnmat.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vminnmat.f16 q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vpt.f16 gt, q1, q0
; CHECK-NEXT:    vminnmat.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}


; Loops
;
; Absolute-maximum reductions over a buffer: a vector accumulator is updated
; in the loop and reduced with the maxnmav intrinsic after it, and the scalar
; result is stored through the third argument.
;
; NOTE(review): the 'bitcast ptr ... to ptr' instructions in these loops are
; no-ops (presumably left over from a typed-pointer version of the test).
; They are kept because removing them would renumber every unnamed value.

; Unpredicated f32 loop using maxnum(fabs(acc), fabs(load)). The expected
; loop body is vabs + vmaxnm inside a wls/le low-overhead loop.
define void @loop_absmax32(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB16_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB16_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vabs.f32 q1, q1
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB16_2
; CHECK-NEXT:  .LBB16_3:
; CHECK-NEXT:    vldr s4, .LCPI16_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI16_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast ptr %9 to ptr
  %11 = load <4 x float>, ptr %10, align 4
  %12 = getelementptr inbounds float, ptr %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, ptr %2, align 4
  ret void
}

; loop_absmax32 with the maxnum operands swapped; only the vmaxnm source
; operand order differs in the expected output.
define void @loop_absmax32_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax32_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB17_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB17_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vabs.f32 q1, q1
; CHECK-NEXT:    vmaxnm.f32 q0, q1, q0
; CHECK-NEXT:    le lr, .LBB17_2
; CHECK-NEXT:  .LBB17_3:
; CHECK-NEXT:    vldr s4, .LCPI17_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI17_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast ptr %9 to ptr
  %11 = load <4 x float>, ptr %10, align 4
  %12 = getelementptr inbounds float, ptr %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, ptr %2, align 4
  ret void
}

; Predicated f32 loop: vctp32 mask, masked load and predicated vmaxnma with
; the accumulator as first operand. The expected output is a dlstp/letp
; tail-predicated loop with a single vmaxnma in its body.
define void @loop_absmax32_pred(ptr %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB18_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q0, q1
; CHECK-NEXT:    letp lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s4, .LCPI18_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f32 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI18_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast ptr %7 to ptr
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, ptr %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, ptr %2, align 4
  ret void
}

; loop_absmax32_pred with the loaded value as first operand of the predicated
; intrinsic. The expected output keeps that order (destination q1) and needs
; an extra vmov in the loop.
define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmaxnma.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    letp lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr s0, .LCPI19_0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmaxnmav.f32 r0, q1
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI19_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast ptr %7 to ptr
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, ptr %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, ptr %2, align 4
  ret void
}


; The <8 x half> versions of the four loops above.
;
; NOTE(review): these loops read 8 halves per iteration but advance the
; pointer by 4 halves (the expected post-increment is #8 bytes), so successive
; reads overlap. This may be unintentional; it does not affect which max
; instruction is selected. Changing the stride requires regenerating the
; expected assembly.

define void @loop_absmax16(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB20_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB20_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
; CHECK-NEXT:    vabs.f16 q1, q1
; CHECK-NEXT:    vmaxnm.f16 q0, q0, q1
; CHECK-NEXT:    le lr, .LBB20_2
; CHECK-NEXT:  .LBB20_3:
; CHECK-NEXT:    vldr.16 s4, .LCPI20_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI20_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast ptr %9 to ptr
  %11 = load <8 x half>, ptr %10, align 4
  %12 = getelementptr inbounds half, ptr %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, ptr %2, align 4
  ret void
}

define void @loop_absmax16_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax16_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    lsrs r1, r1, #3
; CHECK-NEXT:    wls lr, r1, .LBB21_3
; CHECK-NEXT:  @ %bb.1: @ %.preheader
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB21_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
; CHECK-NEXT:    vabs.f16 q1, q1
; CHECK-NEXT:    vmaxnm.f16 q0, q1, q0
; CHECK-NEXT:    le lr, .LBB21_2
; CHECK-NEXT:  .LBB21_3:
; CHECK-NEXT:    vldr.16 s4, .LCPI21_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast ptr %9 to ptr
  %11 = load <8 x half>, ptr %10, align 4
  %12 = getelementptr inbounds half, ptr %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, ptr %2, align 4
  ret void
}

define void @loop_absmax16_pred(ptr %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB22_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q0, q1
; CHECK-NEXT:    letp lr, .LBB22_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr.16 s4, .LCPI22_0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmaxnmav.f16 r0, q0
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast ptr %7 to ptr
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, ptr %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, ptr %2, align 4
  ret void
}

define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred_c:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
; CHECK-NEXT:    vmaxnma.f16 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    letp lr, .LBB23_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmaxnmav.f16 r0, q1
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 1
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast ptr %7 to ptr
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, ptr %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, ptr %2, align 4
  ret void
}


; Intrinsic declarations used above, f32 first and then f16.

declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)

declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)
declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)