; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Lowering of llvm.vector.reduce.add for MVE, covering: plain reductions,
; reductions widened through zext/sext, and the *_acc variants that fold a
; scalar accumulator into the result (expected to select the accumulating
; vaddva/vaddlva forms where legal). Illegal input types (v4i16, v8i8, ...)
; are extended first; cases with no matching vaddlv form fall back to
; scalarised add/adc sequences.

define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ret i32 %z
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  ret i16 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.s16 r0, q0[0]
; CHECK-NEXT:    vmov.s16 r2, q0[1]
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[2]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[3]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[4]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[5]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[6]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[7]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r2, r1, d0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  ret i8 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov.u8 r3, q0[2]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u8 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u8 r2, q0[7]
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[12]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[15]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.s8 r0, q0[0]
; CHECK-NEXT:    vmov.s8 r2, q0[1]
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[2]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[3]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[4]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[5]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[6]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[7]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[8]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[9]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[10]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[11]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[12]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[13]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[14]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[15]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r2, r1, d0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  ret i64 %z
}

; The *_acc variants below additionally fold a scalar accumulator %a into the
; reduction result, so the accumulating vaddva/vaddlva forms are expected
; where a single-instruction reduction exists.

define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov lr, r12, d1
; CHECK-NEXT:    vmov r3, r2, d0
; CHECK-NEXT:    adds.w r3, r3, lr
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, r12, d4
; CHECK-NEXT:    add.w lr, r3, r2
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    add r2, lr
; CHECK-NEXT:    add.w lr, r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    add lr, r2
; CHECK-NEXT:    vmov r3, r2, d5
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds.w r2, r2, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.s16 r2, q0[0]
; CHECK-NEXT:    vmov.s16 r3, q0[1]
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds.w lr, r2, r3
; CHECK-NEXT:    vmov.s16 r2, q0[2]
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds.w r12, lr, r2
; CHECK-NEXT:    adc.w r2, r3, r2, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[3]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[4]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[5]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[6]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w lr, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[7]
; CHECK-NEXT:    adds.w r2, r12, r3
; CHECK-NEXT:    adc.w r3, lr, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlva.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddva.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s8 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
1117entry: 1118 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) 1119 %r = add i8 %z, %a 1120 ret i8 %r 1121} 1122 1123define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { 1124; CHECK-LABEL: add_v16i8_v16i64_acc_zext: 1125; CHECK: @ %bb.0: @ %entry 1126; CHECK-NEXT: .save {r7, lr} 1127; CHECK-NEXT: push {r7, lr} 1128; CHECK-NEXT: vmov.u8 r2, q0[1] 1129; CHECK-NEXT: vmov.u8 r3, q0[0] 1130; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1131; CHECK-NEXT: vmov.i64 q1, #0xff 1132; CHECK-NEXT: vand q2, q2, q1 1133; CHECK-NEXT: vmov r2, s10 1134; CHECK-NEXT: vmov r3, r12, d4 1135; CHECK-NEXT: add.w lr, r3, r2 1136; CHECK-NEXT: vmov.u8 r3, q0[3] 1137; CHECK-NEXT: vmov.u8 r2, q0[2] 1138; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 1139; CHECK-NEXT: vand q2, q2, q1 1140; CHECK-NEXT: vmov r2, s8 1141; CHECK-NEXT: vmov r3, s10 1142; CHECK-NEXT: add r2, lr 1143; CHECK-NEXT: add.w lr, r2, r3 1144; CHECK-NEXT: vmov.u8 r3, q0[5] 1145; CHECK-NEXT: vmov.u8 r2, q0[4] 1146; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 1147; CHECK-NEXT: vand q2, q2, q1 1148; CHECK-NEXT: vmov r2, s8 1149; CHECK-NEXT: add lr, r2 1150; CHECK-NEXT: vmov r3, r2, d5 1151; CHECK-NEXT: adds.w lr, lr, r3 1152; CHECK-NEXT: vmov.u8 r3, q0[6] 1153; CHECK-NEXT: adc.w r12, r12, r2 1154; CHECK-NEXT: vmov.u8 r2, q0[7] 1155; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1156; CHECK-NEXT: vand q2, q2, q1 1157; CHECK-NEXT: vmov r2, r3, d4 1158; CHECK-NEXT: adds.w lr, lr, r2 1159; CHECK-NEXT: adc.w r12, r12, r3 1160; CHECK-NEXT: vmov r2, r3, d5 1161; CHECK-NEXT: adds.w lr, lr, r2 1162; CHECK-NEXT: vmov.u8 r2, q0[9] 1163; CHECK-NEXT: adc.w r12, r12, r3 1164; CHECK-NEXT: vmov.u8 r3, q0[8] 1165; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1166; CHECK-NEXT: vand q2, q2, q1 1167; CHECK-NEXT: vmov r2, r3, d4 1168; CHECK-NEXT: adds.w lr, lr, r2 1169; CHECK-NEXT: adc.w r12, r12, r3 1170; CHECK-NEXT: vmov r2, r3, d5 1171; CHECK-NEXT: adds.w lr, lr, r2 1172; CHECK-NEXT: vmov.u8 r2, q0[11] 1173; CHECK-NEXT: adc.w r12, r12, r3 1174; 
CHECK-NEXT: vmov.u8 r3, q0[10] 1175; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1176; CHECK-NEXT: vand q2, q2, q1 1177; CHECK-NEXT: vmov r2, r3, d4 1178; CHECK-NEXT: adds.w lr, lr, r2 1179; CHECK-NEXT: adc.w r12, r12, r3 1180; CHECK-NEXT: vmov r2, r3, d5 1181; CHECK-NEXT: adds.w lr, lr, r2 1182; CHECK-NEXT: vmov.u8 r2, q0[13] 1183; CHECK-NEXT: adc.w r12, r12, r3 1184; CHECK-NEXT: vmov.u8 r3, q0[12] 1185; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1186; CHECK-NEXT: vand q2, q2, q1 1187; CHECK-NEXT: vmov r2, r3, d4 1188; CHECK-NEXT: adds.w lr, lr, r2 1189; CHECK-NEXT: adc.w r12, r12, r3 1190; CHECK-NEXT: vmov r2, r3, d5 1191; CHECK-NEXT: adds.w lr, lr, r2 1192; CHECK-NEXT: vmov.u8 r2, q0[15] 1193; CHECK-NEXT: adc.w r12, r12, r3 1194; CHECK-NEXT: vmov.u8 r3, q0[14] 1195; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1196; CHECK-NEXT: vand q0, q0, q1 1197; CHECK-NEXT: vmov r2, r3, d0 1198; CHECK-NEXT: adds.w lr, lr, r2 1199; CHECK-NEXT: adc.w r12, r12, r3 1200; CHECK-NEXT: vmov r2, r3, d1 1201; CHECK-NEXT: adds.w r2, r2, lr 1202; CHECK-NEXT: adc.w r3, r3, r12 1203; CHECK-NEXT: adds r0, r0, r2 1204; CHECK-NEXT: adcs r1, r3 1205; CHECK-NEXT: pop {r7, pc} 1206entry: 1207 %xx = zext <16 x i8> %x to <16 x i64> 1208 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) 1209 %r = add i64 %z, %a 1210 ret i64 %r 1211} 1212 1213define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { 1214; CHECK-LABEL: add_v16i8_v16i64_acc_sext: 1215; CHECK: @ %bb.0: @ %entry 1216; CHECK-NEXT: .save {r7, lr} 1217; CHECK-NEXT: push {r7, lr} 1218; CHECK-NEXT: vmov.s8 r2, q0[0] 1219; CHECK-NEXT: vmov.s8 r3, q0[1] 1220; CHECK-NEXT: asr.w r12, r2, #31 1221; CHECK-NEXT: adds.w lr, r2, r3 1222; CHECK-NEXT: vmov.s8 r2, q0[2] 1223; CHECK-NEXT: adc.w r3, r12, r3, asr #31 1224; CHECK-NEXT: adds.w r12, lr, r2 1225; CHECK-NEXT: adc.w r2, r3, r2, asr #31 1226; CHECK-NEXT: vmov.s8 r3, q0[3] 1227; CHECK-NEXT: adds.w r12, r12, r3 1228; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1229; CHECK-NEXT: vmov.s8 r3, 
q0[4] 1230; CHECK-NEXT: adds.w r12, r12, r3 1231; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1232; CHECK-NEXT: vmov.s8 r3, q0[5] 1233; CHECK-NEXT: adds.w r12, r12, r3 1234; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1235; CHECK-NEXT: vmov.s8 r3, q0[6] 1236; CHECK-NEXT: adds.w r12, r12, r3 1237; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1238; CHECK-NEXT: vmov.s8 r3, q0[7] 1239; CHECK-NEXT: adds.w r12, r12, r3 1240; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1241; CHECK-NEXT: vmov.s8 r3, q0[8] 1242; CHECK-NEXT: adds.w r12, r12, r3 1243; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1244; CHECK-NEXT: vmov.s8 r3, q0[9] 1245; CHECK-NEXT: adds.w r12, r12, r3 1246; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1247; CHECK-NEXT: vmov.s8 r3, q0[10] 1248; CHECK-NEXT: adds.w r12, r12, r3 1249; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1250; CHECK-NEXT: vmov.s8 r3, q0[11] 1251; CHECK-NEXT: adds.w r12, r12, r3 1252; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1253; CHECK-NEXT: vmov.s8 r3, q0[12] 1254; CHECK-NEXT: adds.w r12, r12, r3 1255; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1256; CHECK-NEXT: vmov.s8 r3, q0[13] 1257; CHECK-NEXT: adds.w r12, r12, r3 1258; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1259; CHECK-NEXT: vmov.s8 r3, q0[14] 1260; CHECK-NEXT: adds.w r12, r12, r3 1261; CHECK-NEXT: adc.w lr, r2, r3, asr #31 1262; CHECK-NEXT: vmov.s8 r3, q0[15] 1263; CHECK-NEXT: adds.w r2, r12, r3 1264; CHECK-NEXT: adc.w r3, lr, r3, asr #31 1265; CHECK-NEXT: adds r0, r0, r2 1266; CHECK-NEXT: adcs r1, r3 1267; CHECK-NEXT: pop {r7, pc} 1268entry: 1269 %xx = sext <16 x i8> %x to <16 x i64> 1270 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) 1271 %r = add i64 %z, %a 1272 ret i64 %r 1273} 1274 1275define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { 1276; CHECK-LABEL: add_v8i8_v8i64_acc_zext: 1277; CHECK: @ %bb.0: @ %entry 1278; CHECK-NEXT: .save {r7, lr} 1279; CHECK-NEXT: push {r7, lr} 1280; CHECK-NEXT: vmovlb.u8 q0, q0 1281; CHECK-NEXT: vmov.i64 q1, #0xffff 1282; CHECK-NEXT: vmov.u16 r2, q0[1] 1283; 
CHECK-NEXT: vmov.u16 r3, q0[0] 1284; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 1285; CHECK-NEXT: vand q2, q2, q1 1286; CHECK-NEXT: vmov r2, s10 1287; CHECK-NEXT: vmov r3, r12, d4 1288; CHECK-NEXT: add.w lr, r3, r2 1289; CHECK-NEXT: vmov.u16 r3, q0[3] 1290; CHECK-NEXT: vmov.u16 r2, q0[2] 1291; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 1292; CHECK-NEXT: vand q2, q2, q1 1293; CHECK-NEXT: vmov r2, s8 1294; CHECK-NEXT: vmov r3, s10 1295; CHECK-NEXT: add r2, lr 1296; CHECK-NEXT: add.w lr, r2, r3 1297; CHECK-NEXT: vmov.u16 r3, q0[5] 1298; CHECK-NEXT: vmov.u16 r2, q0[4] 1299; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 1300; CHECK-NEXT: vand q2, q2, q1 1301; CHECK-NEXT: vmov r2, s8 1302; CHECK-NEXT: add lr, r2 1303; CHECK-NEXT: vmov r3, r2, d5 1304; CHECK-NEXT: adds.w lr, lr, r3 1305; CHECK-NEXT: vmov.u16 r3, q0[6] 1306; CHECK-NEXT: adc.w r12, r12, r2 1307; CHECK-NEXT: vmov.u16 r2, q0[7] 1308; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1309; CHECK-NEXT: vand q0, q0, q1 1310; CHECK-NEXT: vmov r2, r3, d0 1311; CHECK-NEXT: adds.w lr, lr, r2 1312; CHECK-NEXT: adc.w r12, r12, r3 1313; CHECK-NEXT: vmov r2, r3, d1 1314; CHECK-NEXT: adds.w r2, r2, lr 1315; CHECK-NEXT: adc.w r3, r3, r12 1316; CHECK-NEXT: adds r0, r0, r2 1317; CHECK-NEXT: adcs r1, r3 1318; CHECK-NEXT: pop {r7, pc} 1319entry: 1320 %xx = zext <8 x i8> %x to <8 x i64> 1321 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 1322 %r = add i64 %z, %a 1323 ret i64 %r 1324} 1325 1326define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) { 1327; CHECK-LABEL: add_v8i8_v8i64_acc_sext: 1328; CHECK: @ %bb.0: @ %entry 1329; CHECK-NEXT: .save {r7, lr} 1330; CHECK-NEXT: push {r7, lr} 1331; CHECK-NEXT: vmov.u16 r2, q0[0] 1332; CHECK-NEXT: vmov.u16 r3, q0[1] 1333; CHECK-NEXT: sxtb r2, r2 1334; CHECK-NEXT: sxtb r3, r3 1335; CHECK-NEXT: asr.w r12, r2, #31 1336; CHECK-NEXT: adds.w lr, r2, r3 1337; CHECK-NEXT: vmov.u16 r2, q0[2] 1338; CHECK-NEXT: adc.w r3, r12, r3, asr #31 1339; CHECK-NEXT: sxtb r2, r2 1340; CHECK-NEXT: adds.w 
r12, lr, r2 1341; CHECK-NEXT: adc.w r2, r3, r2, asr #31 1342; CHECK-NEXT: vmov.u16 r3, q0[3] 1343; CHECK-NEXT: sxtb r3, r3 1344; CHECK-NEXT: adds.w r12, r12, r3 1345; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1346; CHECK-NEXT: vmov.u16 r3, q0[4] 1347; CHECK-NEXT: sxtb r3, r3 1348; CHECK-NEXT: adds.w r12, r12, r3 1349; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1350; CHECK-NEXT: vmov.u16 r3, q0[5] 1351; CHECK-NEXT: sxtb r3, r3 1352; CHECK-NEXT: adds.w r12, r12, r3 1353; CHECK-NEXT: adc.w r2, r2, r3, asr #31 1354; CHECK-NEXT: vmov.u16 r3, q0[6] 1355; CHECK-NEXT: sxtb r3, r3 1356; CHECK-NEXT: adds.w r12, r12, r3 1357; CHECK-NEXT: adc.w lr, r2, r3, asr #31 1358; CHECK-NEXT: vmov.u16 r3, q0[7] 1359; CHECK-NEXT: sxtb r3, r3 1360; CHECK-NEXT: adds.w r2, r12, r3 1361; CHECK-NEXT: adc.w r3, lr, r3, asr #31 1362; CHECK-NEXT: adds r0, r0, r2 1363; CHECK-NEXT: adcs r1, r3 1364; CHECK-NEXT: pop {r7, pc} 1365entry: 1366 %xx = sext <8 x i8> %x to <8 x i64> 1367 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 1368 %r = add i64 %z, %a 1369 ret i64 %r 1370} 1371 1372define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) { 1373; CHECK-LABEL: add_v4i8_v4i64_acc_zext: 1374; CHECK: @ %bb.0: @ %entry 1375; CHECK-NEXT: vmov.i32 q1, #0xff 1376; CHECK-NEXT: vand q0, q0, q1 1377; CHECK-NEXT: vaddlva.u32 r0, r1, q0 1378; CHECK-NEXT: bx lr 1379entry: 1380 %xx = zext <4 x i8> %x to <4 x i64> 1381 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1382 %r = add i64 %z, %a 1383 ret i64 %r 1384} 1385 1386define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) { 1387; CHECK-LABEL: add_v4i8_v4i64_acc_sext: 1388; CHECK: @ %bb.0: @ %entry 1389; CHECK-NEXT: vmovlb.s8 q0, q0 1390; CHECK-NEXT: vmovlb.s16 q0, q0 1391; CHECK-NEXT: vaddlva.s32 r0, r1, q0 1392; CHECK-NEXT: bx lr 1393entry: 1394 %xx = sext <4 x i8> %x to <4 x i64> 1395 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1396 %r = add i64 %z, %a 1397 ret i64 %r 1398} 1399 1400define 
arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) { 1401; CHECK-LABEL: add_v2i8_v2i64_acc_zext: 1402; CHECK: @ %bb.0: @ %entry 1403; CHECK-NEXT: vmov.i64 q1, #0xff 1404; CHECK-NEXT: vand q0, q0, q1 1405; CHECK-NEXT: vmov r2, s2 1406; CHECK-NEXT: vmov r3, r12, d0 1407; CHECK-NEXT: add r2, r3 1408; CHECK-NEXT: adds r0, r0, r2 1409; CHECK-NEXT: adc.w r1, r1, r12 1410; CHECK-NEXT: bx lr 1411entry: 1412 %xx = zext <2 x i8> %x to <2 x i64> 1413 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1414 %r = add i64 %z, %a 1415 ret i64 %r 1416} 1417 1418define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { 1419; CHECK-LABEL: add_v2i8_v2i64_acc_sext: 1420; CHECK: @ %bb.0: @ %entry 1421; CHECK-NEXT: vmov r2, s0 1422; CHECK-NEXT: vmov r3, s2 1423; CHECK-NEXT: sxtb r2, r2 1424; CHECK-NEXT: asr.w r12, r2, #31 1425; CHECK-NEXT: sxtb r3, r3 1426; CHECK-NEXT: adds r2, r2, r3 1427; CHECK-NEXT: adc.w r3, r12, r3, asr #31 1428; CHECK-NEXT: adds r0, r0, r2 1429; CHECK-NEXT: adcs r1, r3 1430; CHECK-NEXT: bx lr 1431entry: 1432 %xx = sext <2 x i8> %x to <2 x i64> 1433 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1434 %r = add i64 %z, %a 1435 ret i64 %r 1436} 1437 1438define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) { 1439; CHECK-LABEL: add_v2i64_v2i64_acc: 1440; CHECK: @ %bb.0: @ %entry 1441; CHECK-NEXT: .save {r7, lr} 1442; CHECK-NEXT: push {r7, lr} 1443; CHECK-NEXT: vmov lr, r12, d1 1444; CHECK-NEXT: vmov r3, r2, d0 1445; CHECK-NEXT: adds.w r3, r3, lr 1446; CHECK-NEXT: adc.w r2, r2, r12 1447; CHECK-NEXT: adds r0, r0, r3 1448; CHECK-NEXT: adcs r1, r2 1449; CHECK-NEXT: pop {r7, pc} 1450entry: 1451 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) 1452 %r = add i64 %z, %a 1453 ret i64 %r 1454} 1455 1456declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) 1457declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) 1458declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) 1459declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) 1460declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) 1461declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) 1462declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) 1463declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) 1464declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) 1465declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) 1466