; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -float-abi=hard -verify-machineinstrs %s -o - | FileCheck %s

; This test checks Thumb2/MVE codegen when two vector reductions are combined
; with a scalar operation of the same kind. The first group of functions
; reduces a <8 x ...> and a <4 x ...> vector and combines the results; the
; "nested_*" group additionally folds extra scalar operands (%c, %d) into the
; combine. CHECK lines are autogenerated -- regenerate with
; utils/update_llc_test_checks.py rather than editing them by hand.

define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.f32 q0, q0, q1
; CHECK-NEXT:    vmul.f32 q0, q0, q2
; CHECK-NEXT:    vmul.f32 s2, s2, s3
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmin_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnm.f32 q0, q0, q1
; CHECK-NEXT:    vminnm.f32 q0, q0, q2
; CHECK-NEXT:    vminnm.f32 s2, s2, s3
; CHECK-NEXT:    vminnm.f32 s0, s0, s1
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmax_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
; CHECK-NEXT:    vmaxnm.f32 q0, q0, q2
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}


define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q2
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vldrb.u16 q1, [r2]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vldrb.u16 q0, [r2, #8]
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    vaddva.u8 r0, q2
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: mul_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.i32 q0, q0, q1
; CHECK-NEXT:    vmul.i32 q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    mul r1, r2, r3
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    and.w r1, r2, r3
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vorr q0, q0, q1
; CHECK-NEXT:    vorr q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    orr.w r1, r2, r3
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    veor q0, q0, q1
; CHECK-NEXT:    veor q0, q0, q2
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    eor.w r1, r2, r3
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmin.u32 q0, q0, q1
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    vmin.u32 q0, q0, q2
; CHECK-NEXT:    vminv.u32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmax.u32 q0, q0, q1
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    vmax.u32 q0, q0, q2
; CHECK-NEXT:    vmaxv.u32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmin.s32 q0, q0, q1
; CHECK-NEXT:    mvn r0, #-2147483648
; CHECK-NEXT:    vmin.s32 q0, q0, q2
; CHECK-NEXT:    vminv.s32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmax.s32 q0, q0, q1
; CHECK-NEXT:    mov.w r0, #-2147483648
; CHECK-NEXT:    vmax.s32 q0, q0, q2
; CHECK-NEXT:    vmaxv.s32 r0, q0
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; The nested_* variants below also fold extra scalar operands %c and %d into
; the combined reduction.

define float @nested_add_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_add_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vadd.f32 s2, s4, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s8
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmul.f32 s6, s6, s7
; CHECK-NEXT:    vmul.f32 s4, s4, s5
; CHECK-NEXT:    vmul.f32 s2, s2, s3
; CHECK-NEXT:    vmul.f32 s0, s0, s1
; CHECK-NEXT:    vmul.f32 s4, s4, s6
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    vmul.f32 s2, s4, s9
; CHECK-NEXT:    vmul.f32 s0, s0, s8
; CHECK-NEXT:    vmul.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vaddva.u32 r0, q1
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r8, r3, d2
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    mul r3, r8, r3
; CHECK-NEXT:    muls r5, r4, r5
; CHECK-NEXT:    mul r2, r12, lr
; CHECK-NEXT:    muls r7, r6, r7
; CHECK-NEXT:    muls r2, r3, r2
; CHECK-NEXT:    mul r3, r7, r5
; CHECK-NEXT:    muls r1, r2, r1
; CHECK-NEXT:    muls r0, r3, r0
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    ands r2, r3
; CHECK-NEXT:    and.w r4, r12, lr
; CHECK-NEXT:    ands r2, r4
; CHECK-NEXT:    ands r1, r2
; CHECK-NEXT:    and.w r2, r8, r5
; CHECK-NEXT:    and.w r3, r6, r7
; CHECK-NEXT:    ands r2, r3
; CHECK-NEXT:    ands r0, r2
; CHECK-NEXT:    ands r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    orrs r2, r3
; CHECK-NEXT:    orr.w r4, r12, lr
; CHECK-NEXT:    orrs r2, r4
; CHECK-NEXT:    orrs r1, r2
; CHECK-NEXT:    orr.w r2, r8, r5
; CHECK-NEXT:    orr.w r3, r6, r7
; CHECK-NEXT:    orrs r2, r3
; CHECK-NEXT:    orrs r0, r2
; CHECK-NEXT:    orrs r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_xor_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vmov r8, r5, d1
; CHECK-NEXT:    vmov r6, r7, d0
; CHECK-NEXT:    eors r2, r3
; CHECK-NEXT:    eor.w r4, r12, lr
; CHECK-NEXT:    eors r2, r4
; CHECK-NEXT:    eors r1, r2
; CHECK-NEXT:    eor.w r2, r8, r5
; CHECK-NEXT:    eor.w r3, r6, r7
; CHECK-NEXT:    eors r2, r3
; CHECK-NEXT:    eors r0, r2
; CHECK-NEXT:    eors r0, r1
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mvn r3, #-2147483648
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:    vminv.s32 r3, q1
; CHECK-NEXT:    vminv.s32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, lt
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, lt
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lt
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov.w r3, #-2147483648
; CHECK-NEXT:    mov.w r2, #-2147483648
; CHECK-NEXT:    vmaxv.s32 r3, q1
; CHECK-NEXT:    vmaxv.s32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, gt
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, gt
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, gt
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umin_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov.w r3, #-1
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    vminv.u32 r3, q1
; CHECK-NEXT:    vminv.u32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, lo
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, lo
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lo
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umax_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    vmaxv.u32 r3, q1
; CHECK-NEXT:    vmaxv.u32 r2, q0
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    csel r1, r3, r1, hi
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    csel r0, r2, r0, hi
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, hi
; CHECK-NEXT:    bx lr
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmin_float:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vminnm.f32 s2, s2, s3
; CHECK-NEXT:    vminnm.f32 s0, s0, s1
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    vminnm.f32 s2, s6, s7
; CHECK-NEXT:    vminnm.f32 s4, s4, s5
; CHECK-NEXT:    vminnm.f32 s0, s0, s8
; CHECK-NEXT:    vminnm.f32 s2, s4, s2
; CHECK-NEXT:    vminnm.f32 s2, s2, s9
; CHECK-NEXT:    vminnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmax_float:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    vmaxnm.f32 s2, s6, s7
; CHECK-NEXT:    vmaxnm.f32 s4, s4, s5
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s8
; CHECK-NEXT:    vmaxnm.f32 s2, s4, s2
; CHECK-NEXT:    vmaxnm.f32 s2, s2, s9
; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
; CHECK-NEXT:    bx lr
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)