; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT

; Add-reduction of <2 x i32>.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %arg1
}

; Add-reduction of <4 x i16>.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %arg1
}

; Add-reduction of <4 x i32>.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ret i32 %z
}

; Add-reduction of <8 x i8>.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %arg1
}

; Add-reduction of <4 x i32> zero-extended to <4 x i64>.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv d0, v0.4s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <4 x i32> sign-extended to <4 x i64>.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv d0, v0.4s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Mixed extension: the low half of %xi is zero-extended, the high half is
; sign-extended, the two <2 x i64> halves are added and then reduced.
define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) {
; CHECK-LABEL: add_v4i32_v4i64_zsext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
; CHECK-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1>
  %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3>
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %zz = add <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz)
  ret i64 %z
}

; Add-reduction of <2 x i32> zero-extended to <2 x i64>.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i32> sign-extended to <2 x i64>.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <8 x i16> zero-extended to <8 x i32>.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv s0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <8 x i16> sign-extended to <8 x i32>.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv s0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <4 x i16> zero-extended to <4 x i32>.
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <4 x i16> sign-extended to <4 x i32>.
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <8 x i16> with a zeroext i16 return.
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv h0, v0.8h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    uxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  ret i16 %z
}

; Add-reduction of <8 x i16> zero-extended to <8 x i64>.
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.8h
; CHECK-GI-NEXT:    mov w0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <8 x i16> sign-extended to <8 x i64>.
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.8h
; CHECK-GI-NEXT:    smov x0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <4 x i16> zero-extended to <4 x i64>.
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    mov w0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <4 x i16> sign-extended to <4 x i64>.
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    smov x0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i16> zero-extended to <2 x i64>.
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    fmov x0, d0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i16> sign-extended to <2 x i64>.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    shl v0.2d, v0.2d, #48
; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <16 x i8> zero-extended to <16 x i32>; the expected output
; differs between the base and +dotprod configurations.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <16 x i8> sign-extended to <16 x i32>; the expected output
; differs between the base and +dotprod configurations.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    sxth w0, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <8 x i8> zero-extended to <8 x i32>; the expected output
; differs between the base and +dotprod configurations.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <8 x i8> sign-extended to <8 x i32>; the expected output
; differs between the base and +dotprod configurations.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    sxth w0, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <4 x i8> zero-extended to <4 x i32>.
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <4 x i8> sign-extended to <4 x i32>.
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

; Add-reduction of <16 x i8> zero-extended to <16 x i16>, zeroext return.
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

; Add-reduction of <16 x i8> sign-extended to <16 x i16>, signext return.
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    saddlp v0.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

; Add-reduction of <8 x i8> zero-extended to <8 x i16>, zeroext return.
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

; Add-reduction of <8 x i8> sign-extended to <8 x i16>, signext return.
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

; Add-reduction of <16 x i8> with a zeroext i8 return.
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    addv b0, v0.16b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv b0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    uxtb w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  ret i8 %z
}

; Add-reduction of <16 x i8> zero-extended to <16 x i64>.
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <16 x i8> sign-extended to <16 x i64>.
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <8 x i8> zero-extended to <8 x i64>.
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <8 x i8> sign-extended to <8 x i64>.
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <4 x i8> zero-extended to <4 x i64>.
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <4 x i8> sign-extended to <4 x i64>.
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT:    addp d0, v1.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i8> zero-extended to <2 x i64>.
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    fmov x0, d0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i8> sign-extended to <2 x i64>.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-NEXT:    sshr v0.2d, v0.2d, #56
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

; Add-reduction of <2 x i64>.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  ret i64 %z
}

; Add-reduction of <4 x i32>, then add the scalar accumulator %a.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    ret
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %r = add i32 %z, %a
  ret i32 %r
}

; Add-reduction of <4 x i32> zero-extended to <4 x i64>, plus accumulator %a.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv d0, v0.4s
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <4 x i32> sign-extended to <4 x i64>, plus accumulator %a.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv d0, v0.4s
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <2 x i32> zero-extended to <2 x i64>, plus accumulator %a.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <2 x i32> sign-extended to <2 x i64>, plus accumulator %a.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <8 x i16> zero-extended to <8 x i32>, plus accumulator %a.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv s0, v0.8h
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

; Add-reduction of <8 x i16> sign-extended to <8 x i32>, plus accumulator %a.
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv s0, v0.8h
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

; Add-reduction of <4 x i16> zero-extended to <4 x i32>, plus accumulator %a.
define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

; Add-reduction of <4 x i16> sign-extended to <4 x i32>, plus accumulator %a.
define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w0
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

; Add-reduction of <8 x i16> plus i16 accumulator %a, zeroext return.
define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w8, w8, w0
; CHECK-SD-NEXT:    and w0, w8, #0xffff
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv h0, v0.8h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w8, w0, w8, uxth
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  %r = add i16 %z, %a
  ret i16 %r
}

; Add-reduction of <8 x i16> zero-extended to <8 x i64>, plus accumulator %a.
define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    add x0, x8, x0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.8h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <8 x i16> sign-extended to <8 x i64>, plus accumulator %a.
define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    add x0, x8, x0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.8h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <4 x i16> zero-extended to <4 x i64>, plus accumulator %a.
define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    add x0, x8, x0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <4 x i16> sign-extended to <4 x i64>, plus accumulator %a.
define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    add x0, x8, x0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <2 x i16> zero-extended to <2 x i64>, plus accumulator %a.
define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    add x0, x8, x0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    add x0, x8, x0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

; Add-reduction of <2 x i16> sign-extended to <2 x i64>, plus accumulator %a.
define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    shl v0.2d, v0.2d, #48
; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v0.4s,
v0.4s, v2.4s 1155; CHECK-SD-BASE-NEXT: addv s0, v0.4s 1156; CHECK-SD-BASE-NEXT: fmov w8, s0 1157; CHECK-SD-BASE-NEXT: add w0, w8, w0 1158; CHECK-SD-BASE-NEXT: ret 1159; 1160; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext: 1161; CHECK-SD-DOT: // %bb.0: // %entry 1162; CHECK-SD-DOT-NEXT: movi v1.16b, #1 1163; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 1164; CHECK-SD-DOT-NEXT: udot v2.4s, v0.16b, v1.16b 1165; CHECK-SD-DOT-NEXT: addv s0, v2.4s 1166; CHECK-SD-DOT-NEXT: fmov w8, s0 1167; CHECK-SD-DOT-NEXT: add w0, w8, w0 1168; CHECK-SD-DOT-NEXT: ret 1169; 1170; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext: 1171; CHECK-GI-BASE: // %bb.0: // %entry 1172; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b 1173; CHECK-GI-BASE-NEXT: fmov w8, s0 1174; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth 1175; CHECK-GI-BASE-NEXT: ret 1176; 1177; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext: 1178; CHECK-GI-DOT: // %bb.0: // %entry 1179; CHECK-GI-DOT-NEXT: movi v1.16b, #1 1180; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 1181; CHECK-GI-DOT-NEXT: udot v2.4s, v0.16b, v1.16b 1182; CHECK-GI-DOT-NEXT: addv s0, v2.4s 1183; CHECK-GI-DOT-NEXT: fmov w8, s0 1184; CHECK-GI-DOT-NEXT: add w0, w8, w0 1185; CHECK-GI-DOT-NEXT: ret 1186entry: 1187 %xx = zext <16 x i8> %x to <16 x i32> 1188 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) 1189 %r = add i32 %z, %a 1190 ret i32 %r 1191} 1192 1193define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) { 1194; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext: 1195; CHECK-SD-BASE: // %bb.0: // %entry 1196; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0 1197; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 1198; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h 1199; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h 1200; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s 1201; CHECK-SD-BASE-NEXT: addv s0, v0.4s 1202; CHECK-SD-BASE-NEXT: fmov w8, s0 1203; CHECK-SD-BASE-NEXT: add w0, w8, w0 1204; CHECK-SD-BASE-NEXT: ret 1205; 1206; CHECK-SD-DOT-LABEL: 
add_v16i8_v16i32_acc_sext: 1207; CHECK-SD-DOT: // %bb.0: // %entry 1208; CHECK-SD-DOT-NEXT: movi v1.16b, #1 1209; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 1210; CHECK-SD-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b 1211; CHECK-SD-DOT-NEXT: addv s0, v2.4s 1212; CHECK-SD-DOT-NEXT: fmov w8, s0 1213; CHECK-SD-DOT-NEXT: add w0, w8, w0 1214; CHECK-SD-DOT-NEXT: ret 1215; 1216; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext: 1217; CHECK-GI-BASE: // %bb.0: // %entry 1218; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b 1219; CHECK-GI-BASE-NEXT: fmov w8, s0 1220; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth 1221; CHECK-GI-BASE-NEXT: ret 1222; 1223; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext: 1224; CHECK-GI-DOT: // %bb.0: // %entry 1225; CHECK-GI-DOT-NEXT: movi v1.16b, #1 1226; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 1227; CHECK-GI-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b 1228; CHECK-GI-DOT-NEXT: addv s0, v2.4s 1229; CHECK-GI-DOT-NEXT: fmov w8, s0 1230; CHECK-GI-DOT-NEXT: add w0, w8, w0 1231; CHECK-GI-DOT-NEXT: ret 1232entry: 1233 %xx = sext <16 x i8> %x to <16 x i32> 1234 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) 1235 %r = add i32 %z, %a 1236 ret i32 %r 1237} 1238 1239define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) { 1240; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext: 1241; CHECK-SD-BASE: // %bb.0: // %entry 1242; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 1243; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h 1244; CHECK-SD-BASE-NEXT: fmov w8, s0 1245; CHECK-SD-BASE-NEXT: add w0, w8, w0 1246; CHECK-SD-BASE-NEXT: ret 1247; 1248; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext: 1249; CHECK-SD-DOT: // %bb.0: // %entry 1250; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000 1251; CHECK-SD-DOT-NEXT: movi v2.8b, #1 1252; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b 1253; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s 1254; CHECK-SD-DOT-NEXT: fmov w8, s0 1255; CHECK-SD-DOT-NEXT: add w0, w8, w0 1256; CHECK-SD-DOT-NEXT: ret 1257; 1258; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext: 
1259; CHECK-GI-BASE: // %bb.0: // %entry 1260; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b 1261; CHECK-GI-BASE-NEXT: fmov w8, s0 1262; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth 1263; CHECK-GI-BASE-NEXT: ret 1264; 1265; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext: 1266; CHECK-GI-DOT: // %bb.0: // %entry 1267; CHECK-GI-DOT-NEXT: movi v1.8b, #1 1268; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 1269; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b 1270; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 1271; CHECK-GI-DOT-NEXT: fmov w8, s0 1272; CHECK-GI-DOT-NEXT: add w0, w8, w0 1273; CHECK-GI-DOT-NEXT: ret 1274entry: 1275 %xx = zext <8 x i8> %x to <8 x i32> 1276 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) 1277 %r = add i32 %z, %a 1278 ret i32 %r 1279} 1280 1281define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) { 1282; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext: 1283; CHECK-SD-BASE: // %bb.0: // %entry 1284; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 1285; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h 1286; CHECK-SD-BASE-NEXT: fmov w8, s0 1287; CHECK-SD-BASE-NEXT: add w0, w8, w0 1288; CHECK-SD-BASE-NEXT: ret 1289; 1290; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext: 1291; CHECK-SD-DOT: // %bb.0: // %entry 1292; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000 1293; CHECK-SD-DOT-NEXT: movi v2.8b, #1 1294; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b 1295; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s 1296; CHECK-SD-DOT-NEXT: fmov w8, s0 1297; CHECK-SD-DOT-NEXT: add w0, w8, w0 1298; CHECK-SD-DOT-NEXT: ret 1299; 1300; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext: 1301; CHECK-GI-BASE: // %bb.0: // %entry 1302; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b 1303; CHECK-GI-BASE-NEXT: fmov w8, s0 1304; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth 1305; CHECK-GI-BASE-NEXT: ret 1306; 1307; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext: 1308; CHECK-GI-DOT: // %bb.0: // %entry 1309; CHECK-GI-DOT-NEXT: movi v1.8b, #1 1310; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 1311; CHECK-GI-DOT-NEXT: sdot 
v2.2s, v0.8b, v1.8b 1312; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 1313; CHECK-GI-DOT-NEXT: fmov w8, s0 1314; CHECK-GI-DOT-NEXT: add w0, w8, w0 1315; CHECK-GI-DOT-NEXT: ret 1316entry: 1317 %xx = sext <8 x i8> %x to <8 x i32> 1318 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) 1319 %r = add i32 %z, %a 1320 ret i32 %r 1321} 1322 1323define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) { 1324; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext: 1325; CHECK-SD: // %bb.0: // %entry 1326; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 1327; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1328; CHECK-SD-NEXT: addv s0, v0.4s 1329; CHECK-SD-NEXT: fmov w8, s0 1330; CHECK-SD-NEXT: add w0, w8, w0 1331; CHECK-SD-NEXT: ret 1332; 1333; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext: 1334; CHECK-GI: // %bb.0: // %entry 1335; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff 1336; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b 1337; CHECK-GI-NEXT: uaddlv s0, v0.4h 1338; CHECK-GI-NEXT: fmov w8, s0 1339; CHECK-GI-NEXT: add w0, w0, w8, uxth 1340; CHECK-GI-NEXT: ret 1341entry: 1342 %xx = zext <4 x i8> %x to <4 x i32> 1343 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) 1344 %r = add i32 %z, %a 1345 ret i32 %r 1346} 1347 1348define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) { 1349; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext: 1350; CHECK-SD: // %bb.0: // %entry 1351; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1352; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24 1353; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24 1354; CHECK-SD-NEXT: addv s0, v0.4s 1355; CHECK-SD-NEXT: fmov w8, s0 1356; CHECK-SD-NEXT: add w0, w8, w0 1357; CHECK-SD-NEXT: ret 1358; 1359; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext: 1360; CHECK-GI: // %bb.0: // %entry 1361; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 1362; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 1363; CHECK-GI-NEXT: saddlv s0, v0.4h 1364; CHECK-GI-NEXT: fmov w8, s0 1365; CHECK-GI-NEXT: add w0, w0, w8, sxth 1366; CHECK-GI-NEXT: ret 1367entry: 1368 %xx = sext <4 x i8> %x to <4 x i32> 1369 %z = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> %xx) 1370 %r = add i32 %z, %a 1371 ret i32 %r 1372} 1373 1374define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) { 1375; CHECK-LABEL: add_v16i8_v16i16_acc_zext: 1376; CHECK: // %bb.0: // %entry 1377; CHECK-NEXT: uaddlv h0, v0.16b 1378; CHECK-NEXT: fmov w8, s0 1379; CHECK-NEXT: add w8, w8, w0 1380; CHECK-NEXT: and w0, w8, #0xffff 1381; CHECK-NEXT: ret 1382entry: 1383 %xx = zext <16 x i8> %x to <16 x i16> 1384 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) 1385 %r = add i16 %z, %a 1386 ret i16 %r 1387} 1388 1389define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) { 1390; CHECK-LABEL: add_v16i8_v16i16_acc_sext: 1391; CHECK: // %bb.0: // %entry 1392; CHECK-NEXT: saddlv h0, v0.16b 1393; CHECK-NEXT: fmov w8, s0 1394; CHECK-NEXT: add w8, w8, w0 1395; CHECK-NEXT: sxth w0, w8 1396; CHECK-NEXT: ret 1397entry: 1398 %xx = sext <16 x i8> %x to <16 x i16> 1399 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) 1400 %r = add i16 %z, %a 1401 ret i16 %r 1402} 1403 1404define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) { 1405; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext: 1406; CHECK-SD: // %bb.0: // %entry 1407; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 1408; CHECK-SD-NEXT: addv h0, v0.8h 1409; CHECK-SD-NEXT: fmov w8, s0 1410; CHECK-SD-NEXT: add w8, w8, w0 1411; CHECK-SD-NEXT: and w0, w8, #0xffff 1412; CHECK-SD-NEXT: ret 1413; 1414; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext: 1415; CHECK-GI: // %bb.0: // %entry 1416; CHECK-GI-NEXT: uaddlv h0, v0.8b 1417; CHECK-GI-NEXT: fmov w8, s0 1418; CHECK-GI-NEXT: add w8, w8, w0 1419; CHECK-GI-NEXT: and w0, w8, #0xffff 1420; CHECK-GI-NEXT: ret 1421entry: 1422 %xx = zext <8 x i8> %x to <8 x i16> 1423 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) 1424 %r = add i16 %z, %a 1425 ret i16 %r 1426} 1427 1428define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) { 1429; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext: 1430; CHECK-SD: // %bb.0: 
// %entry 1431; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 1432; CHECK-SD-NEXT: addv h0, v0.8h 1433; CHECK-SD-NEXT: fmov w8, s0 1434; CHECK-SD-NEXT: add w8, w8, w0 1435; CHECK-SD-NEXT: sxth w0, w8 1436; CHECK-SD-NEXT: ret 1437; 1438; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext: 1439; CHECK-GI: // %bb.0: // %entry 1440; CHECK-GI-NEXT: saddlv h0, v0.8b 1441; CHECK-GI-NEXT: fmov w8, s0 1442; CHECK-GI-NEXT: add w8, w8, w0 1443; CHECK-GI-NEXT: sxth w0, w8 1444; CHECK-GI-NEXT: ret 1445entry: 1446 %xx = sext <8 x i8> %x to <8 x i16> 1447 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) 1448 %r = add i16 %z, %a 1449 ret i16 %r 1450} 1451 1452define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) { 1453; CHECK-SD-LABEL: add_v16i8_v16i8_acc: 1454; CHECK-SD: // %bb.0: // %entry 1455; CHECK-SD-NEXT: addv b0, v0.16b 1456; CHECK-SD-NEXT: fmov w8, s0 1457; CHECK-SD-NEXT: add w8, w8, w0 1458; CHECK-SD-NEXT: and w0, w8, #0xff 1459; CHECK-SD-NEXT: ret 1460; 1461; CHECK-GI-LABEL: add_v16i8_v16i8_acc: 1462; CHECK-GI: // %bb.0: // %entry 1463; CHECK-GI-NEXT: addv b0, v0.16b 1464; CHECK-GI-NEXT: fmov w8, s0 1465; CHECK-GI-NEXT: add w8, w0, w8, uxtb 1466; CHECK-GI-NEXT: and w0, w8, #0xff 1467; CHECK-GI-NEXT: ret 1468entry: 1469 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) 1470 %r = add i8 %z, %a 1471 ret i8 %r 1472} 1473 1474define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { 1475; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext: 1476; CHECK-SD: // %bb.0: // %entry 1477; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0 1478; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 1479; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0 1480; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0 1481; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 1482; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1483; CHECK-SD-NEXT: uaddl2 v4.2d, v3.4s, v2.4s 1484; CHECK-SD-NEXT: uaddl v2.2d, v3.2s, v2.2s 1485; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v1.4s 1486; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s 1487; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d 1488; 
CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d 1489; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d 1490; CHECK-SD-NEXT: addp d0, v0.2d 1491; CHECK-SD-NEXT: fmov x8, d0 1492; CHECK-SD-NEXT: add x0, x8, x0 1493; CHECK-SD-NEXT: ret 1494; 1495; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext: 1496; CHECK-GI: // %bb.0: // %entry 1497; CHECK-GI-NEXT: uaddlv h0, v0.16b 1498; CHECK-GI-NEXT: fmov w8, s0 1499; CHECK-GI-NEXT: add x0, x0, w8, uxth 1500; CHECK-GI-NEXT: ret 1501entry: 1502 %xx = zext <16 x i8> %x to <16 x i64> 1503 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) 1504 %r = add i64 %z, %a 1505 ret i64 %r 1506} 1507 1508define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { 1509; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext: 1510; CHECK-SD: // %bb.0: // %entry 1511; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0 1512; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 1513; CHECK-SD-NEXT: sshll2 v2.4s, v1.8h, #0 1514; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0 1515; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0 1516; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 1517; CHECK-SD-NEXT: saddl2 v4.2d, v3.4s, v2.4s 1518; CHECK-SD-NEXT: saddl v2.2d, v3.2s, v2.2s 1519; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v1.4s 1520; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s 1521; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d 1522; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d 1523; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d 1524; CHECK-SD-NEXT: addp d0, v0.2d 1525; CHECK-SD-NEXT: fmov x8, d0 1526; CHECK-SD-NEXT: add x0, x8, x0 1527; CHECK-SD-NEXT: ret 1528; 1529; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext: 1530; CHECK-GI: // %bb.0: // %entry 1531; CHECK-GI-NEXT: saddlv h0, v0.16b 1532; CHECK-GI-NEXT: fmov w8, s0 1533; CHECK-GI-NEXT: add x0, x0, w8, sxth 1534; CHECK-GI-NEXT: ret 1535entry: 1536 %xx = sext <16 x i8> %x to <16 x i64> 1537 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) 1538 %r = add i64 %z, %a 1539 ret i64 %r 1540} 1541 1542define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { 1543; CHECK-SD-LABEL: 
add_v8i8_v8i64_acc_zext: 1544; CHECK-SD: // %bb.0: // %entry 1545; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 1546; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0 1547; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1548; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s 1549; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s 1550; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d 1551; CHECK-SD-NEXT: addp d0, v0.2d 1552; CHECK-SD-NEXT: fmov x8, d0 1553; CHECK-SD-NEXT: add x0, x8, x0 1554; CHECK-SD-NEXT: ret 1555; 1556; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext: 1557; CHECK-GI: // %bb.0: // %entry 1558; CHECK-GI-NEXT: uaddlv h0, v0.8b 1559; CHECK-GI-NEXT: fmov w8, s0 1560; CHECK-GI-NEXT: add x0, x0, w8, uxth 1561; CHECK-GI-NEXT: ret 1562entry: 1563 %xx = zext <8 x i8> %x to <8 x i64> 1564 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 1565 %r = add i64 %z, %a 1566 ret i64 %r 1567} 1568 1569define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) { 1570; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext: 1571; CHECK-SD: // %bb.0: // %entry 1572; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 1573; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 1574; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 1575; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s 1576; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s 1577; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d 1578; CHECK-SD-NEXT: addp d0, v0.2d 1579; CHECK-SD-NEXT: fmov x8, d0 1580; CHECK-SD-NEXT: add x0, x8, x0 1581; CHECK-SD-NEXT: ret 1582; 1583; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext: 1584; CHECK-GI: // %bb.0: // %entry 1585; CHECK-GI-NEXT: saddlv h0, v0.8b 1586; CHECK-GI-NEXT: fmov w8, s0 1587; CHECK-GI-NEXT: add x0, x0, w8, sxth 1588; CHECK-GI-NEXT: ret 1589entry: 1590 %xx = sext <8 x i8> %x to <8 x i64> 1591 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 1592 %r = add i64 %z, %a 1593 ret i64 %r 1594} 1595 1596define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) { 1597; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext: 1598; CHECK-SD: // %bb.0: // %entry 1599; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 1600; 
CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1601; CHECK-SD-NEXT: uaddlv d0, v0.4s 1602; CHECK-SD-NEXT: fmov x8, d0 1603; CHECK-SD-NEXT: add x0, x8, x0 1604; CHECK-SD-NEXT: ret 1605; 1606; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext: 1607; CHECK-GI: // %bb.0: // %entry 1608; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff 1609; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b 1610; CHECK-GI-NEXT: uaddlv s0, v0.4h 1611; CHECK-GI-NEXT: fmov w8, s0 1612; CHECK-GI-NEXT: add x0, x0, w8, uxth 1613; CHECK-GI-NEXT: ret 1614entry: 1615 %xx = zext <4 x i8> %x to <4 x i64> 1616 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1617 %r = add i64 %z, %a 1618 ret i64 %r 1619} 1620 1621define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) { 1622; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext: 1623; CHECK-SD: // %bb.0: // %entry 1624; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 1625; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0 1626; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 1627; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56 1628; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 1629; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56 1630; CHECK-SD-NEXT: ssra v1.2d, v0.2d, #56 1631; CHECK-SD-NEXT: addp d0, v1.2d 1632; CHECK-SD-NEXT: fmov x8, d0 1633; CHECK-SD-NEXT: add x0, x8, x0 1634; CHECK-SD-NEXT: ret 1635; 1636; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext: 1637; CHECK-GI: // %bb.0: // %entry 1638; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 1639; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 1640; CHECK-GI-NEXT: saddlv s0, v0.4h 1641; CHECK-GI-NEXT: fmov w8, s0 1642; CHECK-GI-NEXT: add x0, x0, w8, sxth 1643; CHECK-GI-NEXT: ret 1644entry: 1645 %xx = sext <4 x i8> %x to <4 x i64> 1646 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1647 %r = add i64 %z, %a 1648 ret i64 %r 1649} 1650 1651define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) { 1652; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext: 1653; CHECK-SD: // %bb.0: // %entry 1654; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff 1655; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b 1656; CHECK-SD-NEXT: ushll v0.2d, 
v0.2s, #0 1657; CHECK-SD-NEXT: addp d0, v0.2d 1658; CHECK-SD-NEXT: fmov x8, d0 1659; CHECK-SD-NEXT: add x0, x8, x0 1660; CHECK-SD-NEXT: ret 1661; 1662; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext: 1663; CHECK-GI: // %bb.0: // %entry 1664; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff 1665; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 1666; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b 1667; CHECK-GI-NEXT: addp d0, v0.2d 1668; CHECK-GI-NEXT: fmov x8, d0 1669; CHECK-GI-NEXT: add x0, x8, x0 1670; CHECK-GI-NEXT: ret 1671entry: 1672 %xx = zext <2 x i8> %x to <2 x i64> 1673 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1674 %r = add i64 %z, %a 1675 ret i64 %r 1676} 1677 1678define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { 1679; CHECK-LABEL: add_v2i8_v2i64_acc_sext: 1680; CHECK: // %bb.0: // %entry 1681; CHECK-NEXT: ushll v0.2d, v0.2s, #0 1682; CHECK-NEXT: shl v0.2d, v0.2d, #56 1683; CHECK-NEXT: sshr v0.2d, v0.2d, #56 1684; CHECK-NEXT: addp d0, v0.2d 1685; CHECK-NEXT: fmov x8, d0 1686; CHECK-NEXT: add x0, x8, x0 1687; CHECK-NEXT: ret 1688entry: 1689 %xx = sext <2 x i8> %x to <2 x i64> 1690 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1691 %r = add i64 %z, %a 1692 ret i64 %r 1693} 1694 1695define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) { 1696; CHECK-LABEL: add_v2i64_v2i64_acc: 1697; CHECK: // %bb.0: // %entry 1698; CHECK-NEXT: addp d0, v0.2d 1699; CHECK-NEXT: fmov x8, d0 1700; CHECK-NEXT: add x0, x8, x0 1701; CHECK-NEXT: ret 1702entry: 1703 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) 1704 %r = add i64 %z, %a 1705 ret i64 %r 1706} 1707 1708define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) { 1709; CHECK-SD-LABEL: add_pair_v4i32_v4i32: 1710; CHECK-SD: // %bb.0: // %entry 1711; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 1712; CHECK-SD-NEXT: addv s0, v0.4s 1713; CHECK-SD-NEXT: fmov w0, s0 1714; CHECK-SD-NEXT: ret 1715; 1716; CHECK-GI-LABEL: add_pair_v4i32_v4i32: 1717; CHECK-GI: // %bb.0: // %entry 1718; CHECK-GI-NEXT: addv s0, 
v0.4s 1719; CHECK-GI-NEXT: addv s1, v1.4s 1720; CHECK-GI-NEXT: fmov w8, s0 1721; CHECK-GI-NEXT: fmov w9, s1 1722; CHECK-GI-NEXT: add w0, w8, w9 1723; CHECK-GI-NEXT: ret 1724entry: 1725 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) 1726 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y) 1727 %z = add i32 %z1, %z2 1728 ret i32 %z 1729} 1730 1731define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) { 1732; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext: 1733; CHECK-SD: // %bb.0: // %entry 1734; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s 1735; CHECK-SD-NEXT: uadalp v1.2d, v0.4s 1736; CHECK-SD-NEXT: addp d0, v1.2d 1737; CHECK-SD-NEXT: fmov x0, d0 1738; CHECK-SD-NEXT: ret 1739; 1740; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext: 1741; CHECK-GI: // %bb.0: // %entry 1742; CHECK-GI-NEXT: uaddlv d0, v0.4s 1743; CHECK-GI-NEXT: uaddlv d1, v1.4s 1744; CHECK-GI-NEXT: fmov x8, d0 1745; CHECK-GI-NEXT: fmov x9, d1 1746; CHECK-GI-NEXT: add x0, x8, x9 1747; CHECK-GI-NEXT: ret 1748entry: 1749 %xx = zext <4 x i32> %x to <4 x i64> 1750 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1751 %yy = zext <4 x i32> %y to <4 x i64> 1752 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) 1753 %z = add i64 %z1, %z2 1754 ret i64 %z 1755} 1756 1757define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) { 1758; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext: 1759; CHECK-SD: // %bb.0: // %entry 1760; CHECK-SD-NEXT: saddlp v1.2d, v1.4s 1761; CHECK-SD-NEXT: sadalp v1.2d, v0.4s 1762; CHECK-SD-NEXT: addp d0, v1.2d 1763; CHECK-SD-NEXT: fmov x0, d0 1764; CHECK-SD-NEXT: ret 1765; 1766; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext: 1767; CHECK-GI: // %bb.0: // %entry 1768; CHECK-GI-NEXT: saddlv d0, v0.4s 1769; CHECK-GI-NEXT: saddlv d1, v1.4s 1770; CHECK-GI-NEXT: fmov x8, d0 1771; CHECK-GI-NEXT: fmov x9, d1 1772; CHECK-GI-NEXT: add x0, x8, x9 1773; CHECK-GI-NEXT: ret 1774entry: 1775 %xx = sext <4 x i32> %x to <4 x i64> 1776 %z1 = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 1777 %yy = sext <4 x i32> %y to <4 x i64> 1778 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) 1779 %z = add i64 %z1, %z2 1780 ret i64 %z 1781} 1782 1783define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) { 1784; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext: 1785; CHECK-SD: // %bb.0: // %entry 1786; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1787; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 1788; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] 1789; CHECK-SD-NEXT: uaddlv d0, v0.4s 1790; CHECK-SD-NEXT: fmov x0, d0 1791; CHECK-SD-NEXT: ret 1792; 1793; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext: 1794; CHECK-GI: // %bb.0: // %entry 1795; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 1796; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 1797; CHECK-GI-NEXT: addp d0, v0.2d 1798; CHECK-GI-NEXT: addp d1, v1.2d 1799; CHECK-GI-NEXT: fmov x8, d0 1800; CHECK-GI-NEXT: fmov x9, d1 1801; CHECK-GI-NEXT: add x0, x8, x9 1802; CHECK-GI-NEXT: ret 1803entry: 1804 %xx = zext <2 x i32> %x to <2 x i64> 1805 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1806 %yy = zext <2 x i32> %y to <2 x i64> 1807 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) 1808 %z = add i64 %z1, %z2 1809 ret i64 %z 1810} 1811 1812define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) { 1813; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext: 1814; CHECK-SD: // %bb.0: // %entry 1815; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s 1816; CHECK-SD-NEXT: addp d0, v0.2d 1817; CHECK-SD-NEXT: fmov x0, d0 1818; CHECK-SD-NEXT: ret 1819; 1820; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext: 1821; CHECK-GI: // %bb.0: // %entry 1822; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 1823; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 1824; CHECK-GI-NEXT: addp d0, v0.2d 1825; CHECK-GI-NEXT: addp d1, v1.2d 1826; CHECK-GI-NEXT: fmov x8, d0 1827; CHECK-GI-NEXT: fmov x9, d1 1828; CHECK-GI-NEXT: add x0, x8, x9 1829; CHECK-GI-NEXT: ret 1830entry: 1831 %xx = sext <2 x i32> %x to <2 x 
i64> 1832 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 1833 %yy = sext <2 x i32> %y to <2 x i64> 1834 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) 1835 %z = add i64 %z1, %z2 1836 ret i64 %z 1837} 1838 1839define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) { 1840; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext: 1841; CHECK-SD: // %bb.0: // %entry 1842; CHECK-SD-NEXT: uaddlp v1.4s, v1.8h 1843; CHECK-SD-NEXT: uadalp v1.4s, v0.8h 1844; CHECK-SD-NEXT: addv s0, v1.4s 1845; CHECK-SD-NEXT: fmov w0, s0 1846; CHECK-SD-NEXT: ret 1847; 1848; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext: 1849; CHECK-GI: // %bb.0: // %entry 1850; CHECK-GI-NEXT: uaddlv s0, v0.8h 1851; CHECK-GI-NEXT: uaddlv s1, v1.8h 1852; CHECK-GI-NEXT: fmov w8, s0 1853; CHECK-GI-NEXT: fmov w9, s1 1854; CHECK-GI-NEXT: add w0, w8, w9 1855; CHECK-GI-NEXT: ret 1856entry: 1857 %xx = zext <8 x i16> %x to <8 x i32> 1858 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) 1859 %yy = zext <8 x i16> %y to <8 x i32> 1860 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) 1861 %z = add i32 %z1, %z2 1862 ret i32 %z 1863} 1864 1865define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) { 1866; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext: 1867; CHECK-SD: // %bb.0: // %entry 1868; CHECK-SD-NEXT: saddlp v1.4s, v1.8h 1869; CHECK-SD-NEXT: sadalp v1.4s, v0.8h 1870; CHECK-SD-NEXT: addv s0, v1.4s 1871; CHECK-SD-NEXT: fmov w0, s0 1872; CHECK-SD-NEXT: ret 1873; 1874; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext: 1875; CHECK-GI: // %bb.0: // %entry 1876; CHECK-GI-NEXT: saddlv s0, v0.8h 1877; CHECK-GI-NEXT: saddlv s1, v1.8h 1878; CHECK-GI-NEXT: fmov w8, s0 1879; CHECK-GI-NEXT: fmov w9, s1 1880; CHECK-GI-NEXT: add w0, w8, w9 1881; CHECK-GI-NEXT: ret 1882entry: 1883 %xx = sext <8 x i16> %x to <8 x i32> 1884 %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) 1885 %yy = sext <8 x i16> %y to <8 x i32> 1886 %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) 1887 %z = 
add i32 %z1, %z2 1888 ret i32 %z 1889} 1890 1891define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) { 1892; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext: 1893; CHECK-SD: // %bb.0: // %entry 1894; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1895; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 1896; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] 1897; CHECK-SD-NEXT: uaddlv s0, v0.8h 1898; CHECK-SD-NEXT: fmov w0, s0 1899; CHECK-SD-NEXT: ret 1900; 1901; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext: 1902; CHECK-GI: // %bb.0: // %entry 1903; CHECK-GI-NEXT: uaddlv s0, v0.4h 1904; CHECK-GI-NEXT: uaddlv s1, v1.4h 1905; CHECK-GI-NEXT: fmov w8, s0 1906; CHECK-GI-NEXT: fmov w9, s1 1907; CHECK-GI-NEXT: add w0, w8, w9 1908; CHECK-GI-NEXT: ret 1909entry: 1910 %xx = zext <4 x i16> %x to <4 x i32> 1911 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) 1912 %yy = zext <4 x i16> %y to <4 x i32> 1913 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) 1914 %z = add i32 %z1, %z2 1915 ret i32 %z 1916} 1917 1918define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) { 1919; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext: 1920; CHECK-SD: // %bb.0: // %entry 1921; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v1.4h 1922; CHECK-SD-NEXT: addv s0, v0.4s 1923; CHECK-SD-NEXT: fmov w0, s0 1924; CHECK-SD-NEXT: ret 1925; 1926; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext: 1927; CHECK-GI: // %bb.0: // %entry 1928; CHECK-GI-NEXT: saddlv s0, v0.4h 1929; CHECK-GI-NEXT: saddlv s1, v1.4h 1930; CHECK-GI-NEXT: fmov w8, s0 1931; CHECK-GI-NEXT: fmov w9, s1 1932; CHECK-GI-NEXT: add w0, w8, w9 1933; CHECK-GI-NEXT: ret 1934entry: 1935 %xx = sext <4 x i16> %x to <4 x i32> 1936 %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) 1937 %yy = sext <4 x i16> %y to <4 x i32> 1938 %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) 1939 %z = add i32 %z1, %z2 1940 ret i32 %z 1941} 1942 1943define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) { 1944; CHECK-SD-BASE-LABEL: test_udot_v8i8: 
1945; CHECK-SD-BASE: // %bb.0: // %entry 1946; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b 1947; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h 1948; CHECK-SD-BASE-NEXT: fmov w0, s0 1949; CHECK-SD-BASE-NEXT: ret 1950; 1951; CHECK-SD-DOT-LABEL: test_udot_v8i8: 1952; CHECK-SD-DOT: // %bb.0: // %entry 1953; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 1954; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b 1955; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 1956; CHECK-SD-DOT-NEXT: fmov w0, s0 1957; CHECK-SD-DOT-NEXT: ret 1958; 1959; CHECK-GI-BASE-LABEL: test_udot_v8i8: 1960; CHECK-GI-BASE: // %bb.0: // %entry 1961; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0 1962; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0 1963; CHECK-GI-BASE-NEXT: umull v2.4s, v1.4h, v0.4h 1964; CHECK-GI-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h 1965; CHECK-GI-BASE-NEXT: addv s0, v2.4s 1966; CHECK-GI-BASE-NEXT: fmov w0, s0 1967; CHECK-GI-BASE-NEXT: ret 1968; 1969; CHECK-GI-DOT-LABEL: test_udot_v8i8: 1970; CHECK-GI-DOT: // %bb.0: // %entry 1971; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 1972; CHECK-GI-DOT-NEXT: udot v2.2s, v1.8b, v0.8b 1973; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 1974; CHECK-GI-DOT-NEXT: fmov w0, s0 1975; CHECK-GI-DOT-NEXT: ret 1976entry: 1977 %0 = zext <8 x i8> %a to <8 x i32> 1978 %1 = zext <8 x i8> %b to <8 x i32> 1979 %2 = mul nuw nsw <8 x i32> %1, %0 1980 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) 1981 ret i32 %3 1982} 1983 1984define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) { 1985; CHECK-SD-BASE-LABEL: test_udot_v16i8: 1986; CHECK-SD-BASE: // %bb.0: // %entry 1987; CHECK-SD-BASE-NEXT: umull2 v2.8h, v1.16b, v0.16b 1988; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b 1989; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v0.8h, v2.8h 1990; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h 1991; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 1992; CHECK-SD-BASE-NEXT: addv s0, v0.4s 1993; CHECK-SD-BASE-NEXT: fmov w0, s0 1994; CHECK-SD-BASE-NEXT: ret 1995; 1996; CHECK-SD-DOT-LABEL: 
test_udot_v16i8: 1997; CHECK-SD-DOT: // %bb.0: // %entry 1998; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 1999; CHECK-SD-DOT-NEXT: udot v2.4s, v1.16b, v0.16b 2000; CHECK-SD-DOT-NEXT: addv s0, v2.4s 2001; CHECK-SD-DOT-NEXT: fmov w0, s0 2002; CHECK-SD-DOT-NEXT: ret 2003; 2004; CHECK-GI-BASE-LABEL: test_udot_v16i8: 2005; CHECK-GI-BASE: // %bb.0: // %entry 2006; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0 2007; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 2008; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0 2009; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 2010; CHECK-GI-BASE-NEXT: umull v4.4s, v3.4h, v2.4h 2011; CHECK-GI-BASE-NEXT: umull v5.4s, v1.4h, v0.4h 2012; CHECK-GI-BASE-NEXT: umlal2 v4.4s, v3.8h, v2.8h 2013; CHECK-GI-BASE-NEXT: umlal2 v5.4s, v1.8h, v0.8h 2014; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s 2015; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2016; CHECK-GI-BASE-NEXT: fmov w0, s0 2017; CHECK-GI-BASE-NEXT: ret 2018; 2019; CHECK-GI-DOT-LABEL: test_udot_v16i8: 2020; CHECK-GI-DOT: // %bb.0: // %entry 2021; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 2022; CHECK-GI-DOT-NEXT: udot v2.4s, v1.16b, v0.16b 2023; CHECK-GI-DOT-NEXT: addv s0, v2.4s 2024; CHECK-GI-DOT-NEXT: fmov w0, s0 2025; CHECK-GI-DOT-NEXT: ret 2026entry: 2027 %0 = zext <16 x i8> %a to <16 x i32> 2028 %1 = zext <16 x i8> %b to <16 x i32> 2029 %2 = mul nuw nsw <16 x i32> %1, %0 2030 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 2031 ret i32 %3 2032} 2033 2034define i32 @test_udot_v24i8(ptr %p1, ptr %p2) { 2035; CHECK-SD-BASE-LABEL: test_udot_v24i8: 2036; CHECK-SD-BASE: // %bb.0: // %entry 2037; CHECK-SD-BASE-NEXT: ldr q0, [x0] 2038; CHECK-SD-BASE-NEXT: ldr q1, [x1] 2039; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16] 2040; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16] 2041; CHECK-SD-BASE-NEXT: umull v2.8h, v3.8b, v2.8b 2042; CHECK-SD-BASE-NEXT: umull v3.8h, v1.8b, v0.8b 2043; CHECK-SD-BASE-NEXT: umull2 v0.8h, v1.16b, v0.16b 2044; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v3.8h, v2.8h 2045; CHECK-SD-BASE-NEXT: 
uaddl v2.4s, v3.4h, v2.4h 2046; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v1.4s, v0.8h 2047; CHECK-SD-BASE-NEXT: uaddw v0.4s, v2.4s, v0.4h 2048; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 2049; CHECK-SD-BASE-NEXT: addv s0, v0.4s 2050; CHECK-SD-BASE-NEXT: fmov w0, s0 2051; CHECK-SD-BASE-NEXT: ret 2052; 2053; CHECK-SD-DOT-LABEL: test_udot_v24i8: 2054; CHECK-SD-DOT: // %bb.0: // %entry 2055; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000 2056; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000 2057; CHECK-SD-DOT-NEXT: ldr q2, [x0] 2058; CHECK-SD-DOT-NEXT: ldr q3, [x1] 2059; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16] 2060; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16] 2061; CHECK-SD-DOT-NEXT: udot v1.2s, v5.8b, v4.8b 2062; CHECK-SD-DOT-NEXT: udot v0.4s, v3.16b, v2.16b 2063; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s 2064; CHECK-SD-DOT-NEXT: addv s0, v0.4s 2065; CHECK-SD-DOT-NEXT: fmov w8, s1 2066; CHECK-SD-DOT-NEXT: fmov w9, s0 2067; CHECK-SD-DOT-NEXT: add w0, w9, w8 2068; CHECK-SD-DOT-NEXT: ret 2069; 2070; CHECK-GI-BASE-LABEL: test_udot_v24i8: 2071; CHECK-GI-BASE: // %bb.0: // %entry 2072; CHECK-GI-BASE-NEXT: ldr q0, [x0] 2073; CHECK-GI-BASE-NEXT: ldr q1, [x1] 2074; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16] 2075; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16] 2076; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 2077; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 2078; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 2079; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0 2080; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 2081; CHECK-GI-BASE-NEXT: ushll v3.8h, v3.8b, #0 2082; CHECK-GI-BASE-NEXT: umull v6.4s, v5.4h, v4.4h 2083; CHECK-GI-BASE-NEXT: umull2 v4.4s, v5.8h, v4.8h 2084; CHECK-GI-BASE-NEXT: umull2 v5.4s, v1.8h, v0.8h 2085; CHECK-GI-BASE-NEXT: umull v7.4s, v3.4h, v2.4h 2086; CHECK-GI-BASE-NEXT: umull v0.4s, v1.4h, v0.4h 2087; CHECK-GI-BASE-NEXT: umull2 v1.4s, v3.8h, v2.8h 2088; CHECK-GI-BASE-NEXT: addv s2, v6.4s 2089; CHECK-GI-BASE-NEXT: addv s3, v4.4s 2090; CHECK-GI-BASE-NEXT: addv s4, v5.4s 2091; 
CHECK-GI-BASE-NEXT: addv s5, v7.4s 2092; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2093; CHECK-GI-BASE-NEXT: addv s1, v1.4s 2094; CHECK-GI-BASE-NEXT: fmov w8, s2 2095; CHECK-GI-BASE-NEXT: fmov w9, s3 2096; CHECK-GI-BASE-NEXT: fmov w10, s4 2097; CHECK-GI-BASE-NEXT: fmov w11, s5 2098; CHECK-GI-BASE-NEXT: add w8, w8, w9 2099; CHECK-GI-BASE-NEXT: fmov w9, s0 2100; CHECK-GI-BASE-NEXT: add w10, w10, w11 2101; CHECK-GI-BASE-NEXT: fmov w11, s1 2102; CHECK-GI-BASE-NEXT: add w8, w8, w9 2103; CHECK-GI-BASE-NEXT: add w9, w10, w11 2104; CHECK-GI-BASE-NEXT: add w0, w8, w9 2105; CHECK-GI-BASE-NEXT: ret 2106; 2107; CHECK-GI-DOT-LABEL: test_udot_v24i8: 2108; CHECK-GI-DOT: // %bb.0: // %entry 2109; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 2110; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 2111; CHECK-GI-DOT-NEXT: ldr q2, [x0] 2112; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16] 2113; CHECK-GI-DOT-NEXT: ldr q4, [x1] 2114; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16] 2115; CHECK-GI-DOT-NEXT: udot v1.4s, v4.16b, v2.16b 2116; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b 2117; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s 2118; CHECK-GI-DOT-NEXT: addv s0, v0.4s 2119; CHECK-GI-DOT-NEXT: fmov w0, s0 2120; CHECK-GI-DOT-NEXT: ret 2121entry: 2122 %a = load <24 x i8>, ptr %p1 2123 %b = load <24 x i8>, ptr %p2 2124 %0 = zext <24 x i8> %a to <24 x i32> 2125 %1 = zext <24 x i8> %b to <24 x i32> 2126 %2 = mul nuw nsw <24 x i32> %1, %0 2127 %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2) 2128 ret i32 %3 2129} 2130 2131define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { 2132; CHECK-SD-BASE-LABEL: test_udot_v48i8: 2133; CHECK-SD-BASE: // %bb.0: // %entry 2134; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16] 2135; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32] 2136; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1] 2137; CHECK-SD-BASE-NEXT: ldr q3, [x0] 2138; CHECK-SD-BASE-NEXT: umull2 v6.8h, v2.16b, v0.16b 2139; CHECK-SD-BASE-NEXT: umull v0.8h, v2.8b, v0.8b 2140; CHECK-SD-BASE-NEXT: umull2 v7.8h, v1.16b, v3.16b 2141; 
CHECK-SD-BASE-NEXT: umull v1.8h, v1.8b, v3.8b 2142; CHECK-SD-BASE-NEXT: umull2 v2.8h, v5.16b, v4.16b 2143; CHECK-SD-BASE-NEXT: umull v3.8h, v5.8b, v4.8b 2144; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v7.8h, v6.8h 2145; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v1.8h, v0.8h 2146; CHECK-SD-BASE-NEXT: uaddl v6.4s, v7.4h, v6.4h 2147; CHECK-SD-BASE-NEXT: uaddl v0.4s, v1.4h, v0.4h 2148; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v4.4s, v2.8h 2149; CHECK-SD-BASE-NEXT: uaddw2 v4.4s, v5.4s, v3.8h 2150; CHECK-SD-BASE-NEXT: uaddw v2.4s, v6.4s, v2.4h 2151; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v3.4h 2152; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s 2153; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s 2154; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 2155; CHECK-SD-BASE-NEXT: addv s0, v0.4s 2156; CHECK-SD-BASE-NEXT: fmov w0, s0 2157; CHECK-SD-BASE-NEXT: ret 2158; 2159; CHECK-SD-DOT-LABEL: test_udot_v48i8: 2160; CHECK-SD-DOT: // %bb.0: // %entry 2161; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000 2162; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32] 2163; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32] 2164; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b 2165; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0] 2166; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1] 2167; CHECK-SD-DOT-NEXT: udot v0.4s, v4.16b, v3.16b 2168; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b 2169; CHECK-SD-DOT-NEXT: addv s0, v0.4s 2170; CHECK-SD-DOT-NEXT: fmov w0, s0 2171; CHECK-SD-DOT-NEXT: ret 2172; 2173; CHECK-GI-BASE-LABEL: test_udot_v48i8: 2174; CHECK-GI-BASE: // %bb.0: // %entry 2175; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] 2176; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] 2177; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] 2178; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] 2179; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0 2180; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 2181; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 2182; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 2183; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0 2184; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 2185; CHECK-GI-BASE-NEXT: 
ushll2 v1.8h, v1.16b, #0 2186; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0 2187; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0 2188; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0 2189; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h 2190; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h 2191; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h 2192; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h 2193; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h 2194; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0 2195; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h 2196; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h 2197; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h 2198; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0 2199; CHECK-GI-BASE-NEXT: addv s18, v18.4s 2200; CHECK-GI-BASE-NEXT: addv s4, v4.4s 2201; CHECK-GI-BASE-NEXT: addv s5, v5.4s 2202; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2203; CHECK-GI-BASE-NEXT: addv s19, v19.4s 2204; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h 2205; CHECK-GI-BASE-NEXT: addv s2, v2.4s 2206; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h 2207; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h 2208; CHECK-GI-BASE-NEXT: fmov w8, s18 2209; CHECK-GI-BASE-NEXT: fmov w9, s4 2210; CHECK-GI-BASE-NEXT: fmov w10, s5 2211; CHECK-GI-BASE-NEXT: fmov w11, s0 2212; CHECK-GI-BASE-NEXT: fmov w12, s19 2213; CHECK-GI-BASE-NEXT: addv s4, v16.4s 2214; CHECK-GI-BASE-NEXT: addv s5, v17.4s 2215; CHECK-GI-BASE-NEXT: addv s3, v3.4s 2216; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h 2217; CHECK-GI-BASE-NEXT: add w8, w8, w9 2218; CHECK-GI-BASE-NEXT: addv s1, v1.4s 2219; CHECK-GI-BASE-NEXT: add w9, w11, w12 2220; CHECK-GI-BASE-NEXT: add w8, w8, w10 2221; CHECK-GI-BASE-NEXT: fmov w10, s4 2222; CHECK-GI-BASE-NEXT: fmov w11, s5 2223; CHECK-GI-BASE-NEXT: fmov w12, s2 2224; CHECK-GI-BASE-NEXT: addv s4, v20.4s 2225; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2226; CHECK-GI-BASE-NEXT: add w9, w9, w10 2227; CHECK-GI-BASE-NEXT: add w10, w11, w12 2228; CHECK-GI-BASE-NEXT: fmov w11, s3 2229; 
CHECK-GI-BASE-NEXT: add w8, w8, w9 2230; CHECK-GI-BASE-NEXT: add w9, w10, w11 2231; CHECK-GI-BASE-NEXT: fmov w10, s1 2232; CHECK-GI-BASE-NEXT: fmov w11, s0 2233; CHECK-GI-BASE-NEXT: add w9, w9, w10 2234; CHECK-GI-BASE-NEXT: fmov w10, s4 2235; CHECK-GI-BASE-NEXT: add w8, w8, w9 2236; CHECK-GI-BASE-NEXT: add w9, w10, w11 2237; CHECK-GI-BASE-NEXT: add w0, w8, w9 2238; CHECK-GI-BASE-NEXT: ret 2239; 2240; CHECK-GI-DOT-LABEL: test_udot_v48i8: 2241; CHECK-GI-DOT: // %bb.0: // %entry 2242; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 2243; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 2244; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] 2245; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0] 2246; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 2247; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1] 2248; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32] 2249; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b 2250; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b 2251; CHECK-GI-DOT-NEXT: udot v2.4s, v16.16b, v7.16b 2252; CHECK-GI-DOT-NEXT: addv s0, v0.4s 2253; CHECK-GI-DOT-NEXT: addv s1, v1.4s 2254; CHECK-GI-DOT-NEXT: addv s2, v2.4s 2255; CHECK-GI-DOT-NEXT: fmov w8, s0 2256; CHECK-GI-DOT-NEXT: fmov w9, s1 2257; CHECK-GI-DOT-NEXT: add w8, w8, w9 2258; CHECK-GI-DOT-NEXT: fmov w9, s2 2259; CHECK-GI-DOT-NEXT: add w0, w8, w9 2260; CHECK-GI-DOT-NEXT: ret 2261entry: 2262 %a = load <48 x i8>, ptr %p1 2263 %b = load <48 x i8>, ptr %p2 2264 %0 = zext <48 x i8> %a to <48 x i32> 2265 %1 = zext <48 x i8> %b to <48 x i32> 2266 %2 = mul nuw nsw <48 x i32> %1, %0 2267 %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2) 2268 ret i32 %3 2269} 2270 2271define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) { 2272; CHECK-SD-BASE-LABEL: test_sdot_v8i8: 2273; CHECK-SD-BASE: // %bb.0: // %entry 2274; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b 2275; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h 2276; CHECK-SD-BASE-NEXT: fmov w0, s0 2277; CHECK-SD-BASE-NEXT: ret 2278; 2279; CHECK-SD-DOT-LABEL: test_sdot_v8i8: 2280; CHECK-SD-DOT: // %bb.0: // 
%entry 2281; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 2282; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b 2283; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 2284; CHECK-SD-DOT-NEXT: fmov w0, s0 2285; CHECK-SD-DOT-NEXT: ret 2286; 2287; CHECK-GI-BASE-LABEL: test_sdot_v8i8: 2288; CHECK-GI-BASE: // %bb.0: // %entry 2289; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0 2290; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0 2291; CHECK-GI-BASE-NEXT: smull v2.4s, v1.4h, v0.4h 2292; CHECK-GI-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h 2293; CHECK-GI-BASE-NEXT: addv s0, v2.4s 2294; CHECK-GI-BASE-NEXT: fmov w0, s0 2295; CHECK-GI-BASE-NEXT: ret 2296; 2297; CHECK-GI-DOT-LABEL: test_sdot_v8i8: 2298; CHECK-GI-DOT: // %bb.0: // %entry 2299; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 2300; CHECK-GI-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b 2301; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s 2302; CHECK-GI-DOT-NEXT: fmov w0, s0 2303; CHECK-GI-DOT-NEXT: ret 2304entry: 2305 %0 = sext <8 x i8> %a to <8 x i32> 2306 %1 = sext <8 x i8> %b to <8 x i32> 2307 %2 = mul nuw nsw <8 x i32> %1, %0 2308 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) 2309 ret i32 %3 2310} 2311 2312define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) { 2313; CHECK-SD-BASE-LABEL: test_sdot_v16i8: 2314; CHECK-SD-BASE: // %bb.0: // %entry 2315; CHECK-SD-BASE-NEXT: smull2 v2.8h, v1.16b, v0.16b 2316; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b 2317; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v0.8h, v2.8h 2318; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h 2319; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 2320; CHECK-SD-BASE-NEXT: addv s0, v0.4s 2321; CHECK-SD-BASE-NEXT: fmov w0, s0 2322; CHECK-SD-BASE-NEXT: ret 2323; 2324; CHECK-SD-DOT-LABEL: test_sdot_v16i8: 2325; CHECK-SD-DOT: // %bb.0: // %entry 2326; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 2327; CHECK-SD-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b 2328; CHECK-SD-DOT-NEXT: addv s0, v2.4s 2329; CHECK-SD-DOT-NEXT: fmov w0, s0 2330; CHECK-SD-DOT-NEXT: ret 2331; 2332; 
CHECK-GI-BASE-LABEL: test_sdot_v16i8: 2333; CHECK-GI-BASE: // %bb.0: // %entry 2334; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0 2335; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 2336; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0 2337; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 2338; CHECK-GI-BASE-NEXT: smull v4.4s, v3.4h, v2.4h 2339; CHECK-GI-BASE-NEXT: smull v5.4s, v1.4h, v0.4h 2340; CHECK-GI-BASE-NEXT: smlal2 v4.4s, v3.8h, v2.8h 2341; CHECK-GI-BASE-NEXT: smlal2 v5.4s, v1.8h, v0.8h 2342; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s 2343; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2344; CHECK-GI-BASE-NEXT: fmov w0, s0 2345; CHECK-GI-BASE-NEXT: ret 2346; 2347; CHECK-GI-DOT-LABEL: test_sdot_v16i8: 2348; CHECK-GI-DOT: // %bb.0: // %entry 2349; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 2350; CHECK-GI-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b 2351; CHECK-GI-DOT-NEXT: addv s0, v2.4s 2352; CHECK-GI-DOT-NEXT: fmov w0, s0 2353; CHECK-GI-DOT-NEXT: ret 2354entry: 2355 %0 = sext <16 x i8> %a to <16 x i32> 2356 %1 = sext <16 x i8> %b to <16 x i32> 2357 %2 = mul nuw nsw <16 x i32> %1, %0 2358 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) 2359 ret i32 %3 2360} 2361 2362define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) { 2363; CHECK-SD-BASE-LABEL: test_sdot_v24i8: 2364; CHECK-SD-BASE: // %bb.0: // %entry 2365; CHECK-SD-BASE-NEXT: ldr q0, [x0] 2366; CHECK-SD-BASE-NEXT: ldr q1, [x1] 2367; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16] 2368; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16] 2369; CHECK-SD-BASE-NEXT: smull v2.8h, v3.8b, v2.8b 2370; CHECK-SD-BASE-NEXT: smull v3.8h, v1.8b, v0.8b 2371; CHECK-SD-BASE-NEXT: smull2 v0.8h, v1.16b, v0.16b 2372; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v3.8h, v2.8h 2373; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h 2374; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v1.4s, v0.8h 2375; CHECK-SD-BASE-NEXT: saddw v0.4s, v2.4s, v0.4h 2376; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 2377; CHECK-SD-BASE-NEXT: addv s0, v0.4s 2378; CHECK-SD-BASE-NEXT: fmov w0, s0 2379; 
CHECK-SD-BASE-NEXT: ret 2380; 2381; CHECK-SD-DOT-LABEL: test_sdot_v24i8: 2382; CHECK-SD-DOT: // %bb.0: // %entry 2383; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000 2384; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000 2385; CHECK-SD-DOT-NEXT: ldr q2, [x0] 2386; CHECK-SD-DOT-NEXT: ldr q3, [x1] 2387; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16] 2388; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16] 2389; CHECK-SD-DOT-NEXT: sdot v1.2s, v5.8b, v4.8b 2390; CHECK-SD-DOT-NEXT: sdot v0.4s, v3.16b, v2.16b 2391; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s 2392; CHECK-SD-DOT-NEXT: addv s0, v0.4s 2393; CHECK-SD-DOT-NEXT: fmov w8, s1 2394; CHECK-SD-DOT-NEXT: fmov w9, s0 2395; CHECK-SD-DOT-NEXT: add w0, w9, w8 2396; CHECK-SD-DOT-NEXT: ret 2397; 2398; CHECK-GI-BASE-LABEL: test_sdot_v24i8: 2399; CHECK-GI-BASE: // %bb.0: // %entry 2400; CHECK-GI-BASE-NEXT: ldr q0, [x0] 2401; CHECK-GI-BASE-NEXT: ldr q1, [x1] 2402; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16] 2403; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16] 2404; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 2405; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 2406; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 2407; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0 2408; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 2409; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0 2410; CHECK-GI-BASE-NEXT: smull v6.4s, v5.4h, v4.4h 2411; CHECK-GI-BASE-NEXT: smull2 v4.4s, v5.8h, v4.8h 2412; CHECK-GI-BASE-NEXT: smull2 v5.4s, v1.8h, v0.8h 2413; CHECK-GI-BASE-NEXT: smull v7.4s, v3.4h, v2.4h 2414; CHECK-GI-BASE-NEXT: smull v0.4s, v1.4h, v0.4h 2415; CHECK-GI-BASE-NEXT: smull2 v1.4s, v3.8h, v2.8h 2416; CHECK-GI-BASE-NEXT: addv s2, v6.4s 2417; CHECK-GI-BASE-NEXT: addv s3, v4.4s 2418; CHECK-GI-BASE-NEXT: addv s4, v5.4s 2419; CHECK-GI-BASE-NEXT: addv s5, v7.4s 2420; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2421; CHECK-GI-BASE-NEXT: addv s1, v1.4s 2422; CHECK-GI-BASE-NEXT: fmov w8, s2 2423; CHECK-GI-BASE-NEXT: fmov w9, s3 2424; CHECK-GI-BASE-NEXT: fmov w10, s4 2425; CHECK-GI-BASE-NEXT: fmov w11, s5 2426; 
CHECK-GI-BASE-NEXT: add w8, w8, w9 2427; CHECK-GI-BASE-NEXT: fmov w9, s0 2428; CHECK-GI-BASE-NEXT: add w10, w10, w11 2429; CHECK-GI-BASE-NEXT: fmov w11, s1 2430; CHECK-GI-BASE-NEXT: add w8, w8, w9 2431; CHECK-GI-BASE-NEXT: add w9, w10, w11 2432; CHECK-GI-BASE-NEXT: add w0, w8, w9 2433; CHECK-GI-BASE-NEXT: ret 2434; 2435; CHECK-GI-DOT-LABEL: test_sdot_v24i8: 2436; CHECK-GI-DOT: // %bb.0: // %entry 2437; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 2438; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 2439; CHECK-GI-DOT-NEXT: ldr q2, [x0] 2440; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16] 2441; CHECK-GI-DOT-NEXT: ldr q4, [x1] 2442; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16] 2443; CHECK-GI-DOT-NEXT: sdot v1.4s, v4.16b, v2.16b 2444; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b 2445; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s 2446; CHECK-GI-DOT-NEXT: addv s0, v0.4s 2447; CHECK-GI-DOT-NEXT: fmov w0, s0 2448; CHECK-GI-DOT-NEXT: ret 2449entry: 2450 %a = load <24 x i8>, ptr %p1 2451 %b = load <24 x i8>, ptr %p2 2452 %0 = sext <24 x i8> %a to <24 x i32> 2453 %1 = sext <24 x i8> %b to <24 x i32> 2454 %2 = mul nuw nsw <24 x i32> %1, %0 2455 %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2) 2456 ret i32 %3 2457} 2458 2459define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { 2460; CHECK-SD-BASE-LABEL: test_sdot_v48i8: 2461; CHECK-SD-BASE: // %bb.0: // %entry 2462; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16] 2463; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32] 2464; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1] 2465; CHECK-SD-BASE-NEXT: ldr q3, [x0] 2466; CHECK-SD-BASE-NEXT: smull2 v6.8h, v2.16b, v0.16b 2467; CHECK-SD-BASE-NEXT: smull v0.8h, v2.8b, v0.8b 2468; CHECK-SD-BASE-NEXT: smull2 v7.8h, v1.16b, v3.16b 2469; CHECK-SD-BASE-NEXT: smull v1.8h, v1.8b, v3.8b 2470; CHECK-SD-BASE-NEXT: smull2 v2.8h, v5.16b, v4.16b 2471; CHECK-SD-BASE-NEXT: smull v3.8h, v5.8b, v4.8b 2472; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v7.8h, v6.8h 2473; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v1.8h, v0.8h 2474; CHECK-SD-BASE-NEXT: saddl 
v6.4s, v7.4h, v6.4h 2475; CHECK-SD-BASE-NEXT: saddl v0.4s, v1.4h, v0.4h 2476; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v4.4s, v2.8h 2477; CHECK-SD-BASE-NEXT: saddw2 v4.4s, v5.4s, v3.8h 2478; CHECK-SD-BASE-NEXT: saddw v2.4s, v6.4s, v2.4h 2479; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v3.4h 2480; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s 2481; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s 2482; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 2483; CHECK-SD-BASE-NEXT: addv s0, v0.4s 2484; CHECK-SD-BASE-NEXT: fmov w0, s0 2485; CHECK-SD-BASE-NEXT: ret 2486; 2487; CHECK-SD-DOT-LABEL: test_sdot_v48i8: 2488; CHECK-SD-DOT: // %bb.0: // %entry 2489; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000 2490; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32] 2491; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32] 2492; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b 2493; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0] 2494; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1] 2495; CHECK-SD-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b 2496; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b 2497; CHECK-SD-DOT-NEXT: addv s0, v0.4s 2498; CHECK-SD-DOT-NEXT: fmov w0, s0 2499; CHECK-SD-DOT-NEXT: ret 2500; 2501; CHECK-GI-BASE-LABEL: test_sdot_v48i8: 2502; CHECK-GI-BASE: // %bb.0: // %entry 2503; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] 2504; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] 2505; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] 2506; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] 2507; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0 2508; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 2509; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 2510; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 2511; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0 2512; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 2513; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 2514; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0 2515; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0 2516; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0 2517; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h 2518; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h 2519; 
CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h 2520; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h 2521; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h 2522; CHECK-GI-BASE-NEXT: sshll v1.8h, v7.8b, #0 2523; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h 2524; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h 2525; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h 2526; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0 2527; CHECK-GI-BASE-NEXT: addv s18, v18.4s 2528; CHECK-GI-BASE-NEXT: addv s4, v4.4s 2529; CHECK-GI-BASE-NEXT: addv s5, v5.4s 2530; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2531; CHECK-GI-BASE-NEXT: addv s19, v19.4s 2532; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h 2533; CHECK-GI-BASE-NEXT: addv s2, v2.4s 2534; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h 2535; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h 2536; CHECK-GI-BASE-NEXT: fmov w8, s18 2537; CHECK-GI-BASE-NEXT: fmov w9, s4 2538; CHECK-GI-BASE-NEXT: fmov w10, s5 2539; CHECK-GI-BASE-NEXT: fmov w11, s0 2540; CHECK-GI-BASE-NEXT: fmov w12, s19 2541; CHECK-GI-BASE-NEXT: addv s4, v16.4s 2542; CHECK-GI-BASE-NEXT: addv s5, v17.4s 2543; CHECK-GI-BASE-NEXT: addv s3, v3.4s 2544; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h 2545; CHECK-GI-BASE-NEXT: add w8, w8, w9 2546; CHECK-GI-BASE-NEXT: addv s1, v1.4s 2547; CHECK-GI-BASE-NEXT: add w9, w11, w12 2548; CHECK-GI-BASE-NEXT: add w8, w8, w10 2549; CHECK-GI-BASE-NEXT: fmov w10, s4 2550; CHECK-GI-BASE-NEXT: fmov w11, s5 2551; CHECK-GI-BASE-NEXT: fmov w12, s2 2552; CHECK-GI-BASE-NEXT: addv s4, v20.4s 2553; CHECK-GI-BASE-NEXT: addv s0, v0.4s 2554; CHECK-GI-BASE-NEXT: add w9, w9, w10 2555; CHECK-GI-BASE-NEXT: add w10, w11, w12 2556; CHECK-GI-BASE-NEXT: fmov w11, s3 2557; CHECK-GI-BASE-NEXT: add w8, w8, w9 2558; CHECK-GI-BASE-NEXT: add w9, w10, w11 2559; CHECK-GI-BASE-NEXT: fmov w10, s1 2560; CHECK-GI-BASE-NEXT: fmov w11, s0 2561; CHECK-GI-BASE-NEXT: add w9, w9, w10 2562; CHECK-GI-BASE-NEXT: fmov w10, s4 2563; CHECK-GI-BASE-NEXT: add w8, w8, w9 2564; 
CHECK-GI-BASE-NEXT: add w9, w10, w11 2565; CHECK-GI-BASE-NEXT: add w0, w8, w9 2566; CHECK-GI-BASE-NEXT: ret 2567; 2568; CHECK-GI-DOT-LABEL: test_sdot_v48i8: 2569; CHECK-GI-DOT: // %bb.0: // %entry 2570; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000 2571; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 2572; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] 2573; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0] 2574; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 2575; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1] 2576; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32] 2577; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b 2578; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b 2579; CHECK-GI-DOT-NEXT: sdot v2.4s, v16.16b, v7.16b 2580; CHECK-GI-DOT-NEXT: addv s0, v0.4s 2581; CHECK-GI-DOT-NEXT: addv s1, v1.4s 2582; CHECK-GI-DOT-NEXT: addv s2, v2.4s 2583; CHECK-GI-DOT-NEXT: fmov w8, s0 2584; CHECK-GI-DOT-NEXT: fmov w9, s1 2585; CHECK-GI-DOT-NEXT: add w8, w8, w9 2586; CHECK-GI-DOT-NEXT: fmov w9, s2 2587; CHECK-GI-DOT-NEXT: add w0, w8, w9 2588; CHECK-GI-DOT-NEXT: ret 2589entry: 2590 %a = load <48 x i8>, ptr %p1 2591 %b = load <48 x i8>, ptr %p2 2592 %0 = sext <48 x i8> %a to <48 x i32> 2593 %1 = sext <48 x i8> %b to <48 x i32> 2594 %2 = mul nuw nsw <48 x i32> %1, %0 2595 %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2) 2596 ret i32 %3 2597} 2598 2599; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT 2600define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) { 2601; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use: 2602; CHECK-SD-BASE: // %bb.0: // %entry 2603; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b 2604; CHECK-SD-BASE-NEXT: uaddlv s1, v0.8h 2605; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0 2606; CHECK-SD-BASE-NEXT: fmov w9, s0 2607; CHECK-SD-BASE-NEXT: fmov w8, s1 2608; CHECK-SD-BASE-NEXT: add w0, w8, w9 2609; CHECK-SD-BASE-NEXT: ret 2610; 2611; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use: 2612; CHECK-SD-DOT: // %bb.0: // %entry 2613; CHECK-SD-DOT-NEXT: movi 
v2.2d, #0000000000000000 2614; CHECK-SD-DOT-NEXT: umull v3.8h, v1.8b, v0.8b 2615; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b 2616; CHECK-SD-DOT-NEXT: ushll v0.4s, v3.4h, #0 2617; CHECK-SD-DOT-NEXT: fmov w9, s0 2618; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s 2619; CHECK-SD-DOT-NEXT: fmov w8, s1 2620; CHECK-SD-DOT-NEXT: add w0, w8, w9 2621; CHECK-SD-DOT-NEXT: ret 2622; 2623; CHECK-GI-LABEL: test_udot_v8i8_multi_use: 2624; CHECK-GI: // %bb.0: // %entry 2625; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 2626; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 2627; CHECK-GI-NEXT: umull v2.4s, v1.4h, v0.4h 2628; CHECK-GI-NEXT: mov v3.16b, v2.16b 2629; CHECK-GI-NEXT: fmov w8, s2 2630; CHECK-GI-NEXT: umlal2 v3.4s, v1.8h, v0.8h 2631; CHECK-GI-NEXT: addv s0, v3.4s 2632; CHECK-GI-NEXT: fmov w9, s0 2633; CHECK-GI-NEXT: add w0, w9, w8 2634; CHECK-GI-NEXT: ret 2635entry: 2636 %0 = zext <8 x i8> %a to <8 x i32> 2637 %1 = zext <8 x i8> %b to <8 x i32> 2638 %2 = mul nuw nsw <8 x i32> %1, %0 2639 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) 2640 %4 = extractelement <8 x i32> %2, i32 0 2641 %5 = add nuw nsw i32 %3, %4 2642 ret i32 %5 2643} 2644 2645define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) { 2646; CHECK-SD-LABEL: add_pair_v8i16_v8i16: 2647; CHECK-SD: // %bb.0: // %entry 2648; CHECK-SD-NEXT: add v0.8h, v0.8h, v1.8h 2649; CHECK-SD-NEXT: addv h0, v0.8h 2650; CHECK-SD-NEXT: fmov w0, s0 2651; CHECK-SD-NEXT: ret 2652; 2653; CHECK-GI-LABEL: add_pair_v8i16_v8i16: 2654; CHECK-GI: // %bb.0: // %entry 2655; CHECK-GI-NEXT: addv h0, v0.8h 2656; CHECK-GI-NEXT: addv h1, v1.8h 2657; CHECK-GI-NEXT: fmov w8, s0 2658; CHECK-GI-NEXT: fmov w9, s1 2659; CHECK-GI-NEXT: add w8, w9, w8, uxth 2660; CHECK-GI-NEXT: and w0, w8, #0xffff 2661; CHECK-GI-NEXT: ret 2662entry: 2663 %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) 2664 %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y) 2665 %z = add i16 %z1, %z2 2666 ret i16 %z 2667} 2668 2669define i64 
@add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) { 2670; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext: 2671; CHECK-SD: // %bb.0: // %entry 2672; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0 2673; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 2674; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0 2675; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 2676; CHECK-SD-NEXT: uaddl2 v4.2d, v0.4s, v2.4s 2677; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s 2678; CHECK-SD-NEXT: uaddl2 v2.2d, v1.4s, v3.4s 2679; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s 2680; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d 2681; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d 2682; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d 2683; CHECK-SD-NEXT: addp d0, v0.2d 2684; CHECK-SD-NEXT: fmov x0, d0 2685; CHECK-SD-NEXT: ret 2686; 2687; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext: 2688; CHECK-GI: // %bb.0: // %entry 2689; CHECK-GI-NEXT: uaddlv s1, v1.8h 2690; CHECK-GI-NEXT: uaddlv s0, v0.8h 2691; CHECK-GI-NEXT: mov w8, v1.s[0] 2692; CHECK-GI-NEXT: fmov w9, s0 2693; CHECK-GI-NEXT: add x0, x8, w9, uxtw 2694; CHECK-GI-NEXT: ret 2695entry: 2696 %xx = zext <8 x i16> %x to <8 x i64> 2697 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 2698 %yy = zext <8 x i16> %y to <8 x i64> 2699 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) 2700 %z = add i64 %z1, %z2 2701 ret i64 %z 2702} 2703 2704define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) { 2705; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext: 2706; CHECK-SD: // %bb.0: // %entry 2707; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0 2708; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 2709; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0 2710; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0 2711; CHECK-SD-NEXT: saddl2 v4.2d, v0.4s, v2.4s 2712; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s 2713; CHECK-SD-NEXT: saddl2 v2.2d, v1.4s, v3.4s 2714; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s 2715; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d 2716; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d 2717; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d 2718; 
CHECK-SD-NEXT: addp d0, v0.2d 2719; CHECK-SD-NEXT: fmov x0, d0 2720; CHECK-SD-NEXT: ret 2721; 2722; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext: 2723; CHECK-GI: // %bb.0: // %entry 2724; CHECK-GI-NEXT: saddlv s1, v1.8h 2725; CHECK-GI-NEXT: saddlv s0, v0.8h 2726; CHECK-GI-NEXT: smov x8, v1.s[0] 2727; CHECK-GI-NEXT: fmov w9, s0 2728; CHECK-GI-NEXT: add x0, x8, w9, sxtw 2729; CHECK-GI-NEXT: ret 2730entry: 2731 %xx = sext <8 x i16> %x to <8 x i64> 2732 %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) 2733 %yy = sext <8 x i16> %y to <8 x i64> 2734 %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) 2735 %z = add i64 %z1, %z2 2736 ret i64 %z 2737} 2738 2739define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) { 2740; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext: 2741; CHECK-SD: // %bb.0: // %entry 2742; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 2743; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 2744; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s 2745; CHECK-SD-NEXT: uadalp v1.2d, v0.4s 2746; CHECK-SD-NEXT: addp d0, v1.2d 2747; CHECK-SD-NEXT: fmov x0, d0 2748; CHECK-SD-NEXT: ret 2749; 2750; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext: 2751; CHECK-GI: // %bb.0: // %entry 2752; CHECK-GI-NEXT: uaddlv s1, v1.4h 2753; CHECK-GI-NEXT: uaddlv s0, v0.4h 2754; CHECK-GI-NEXT: mov w8, v1.s[0] 2755; CHECK-GI-NEXT: fmov w9, s0 2756; CHECK-GI-NEXT: add x0, x8, w9, uxtw 2757; CHECK-GI-NEXT: ret 2758entry: 2759 %xx = zext <4 x i16> %x to <4 x i64> 2760 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 2761 %yy = zext <4 x i16> %y to <4 x i64> 2762 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) 2763 %z = add i64 %z1, %z2 2764 ret i64 %z 2765} 2766 2767define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { 2768; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext: 2769; CHECK-SD: // %bb.0: // %entry 2770; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0 2771; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 2772; CHECK-SD-NEXT: saddlp v1.2d, v1.4s 2773; CHECK-SD-NEXT: sadalp v1.2d, 
v0.4s 2774; CHECK-SD-NEXT: addp d0, v1.2d 2775; CHECK-SD-NEXT: fmov x0, d0 2776; CHECK-SD-NEXT: ret 2777; 2778; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext: 2779; CHECK-GI: // %bb.0: // %entry 2780; CHECK-GI-NEXT: saddlv s1, v1.4h 2781; CHECK-GI-NEXT: saddlv s0, v0.4h 2782; CHECK-GI-NEXT: smov x8, v1.s[0] 2783; CHECK-GI-NEXT: fmov w9, s0 2784; CHECK-GI-NEXT: add x0, x8, w9, sxtw 2785; CHECK-GI-NEXT: ret 2786entry: 2787 %xx = sext <4 x i16> %x to <4 x i64> 2788 %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) 2789 %yy = sext <4 x i16> %y to <4 x i64> 2790 %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) 2791 %z = add i64 %z1, %z2 2792 ret i64 %z 2793} 2794 2795define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) { 2796; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext: 2797; CHECK-SD: // %bb.0: // %entry 2798; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 2799; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 2800; CHECK-SD-NEXT: movi v2.2d, #0x00ffff0000ffff 2801; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] 2802; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b 2803; CHECK-SD-NEXT: uaddlv d0, v0.4s 2804; CHECK-SD-NEXT: fmov x0, d0 2805; CHECK-SD-NEXT: ret 2806; 2807; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext: 2808; CHECK-GI: // %bb.0: // %entry 2809; CHECK-GI-NEXT: movi v2.2d, #0x0000000000ffff 2810; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 2811; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 2812; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b 2813; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 2814; CHECK-GI-NEXT: addp d0, v0.2d 2815; CHECK-GI-NEXT: addp d1, v1.2d 2816; CHECK-GI-NEXT: fmov x8, d0 2817; CHECK-GI-NEXT: fmov x9, d1 2818; CHECK-GI-NEXT: add x0, x8, x9 2819; CHECK-GI-NEXT: ret 2820entry: 2821 %xx = zext <2 x i16> %x to <2 x i64> 2822 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 2823 %yy = zext <2 x i16> %y to <2 x i64> 2824 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) 2825 %z = add i64 %z1, %z2 2826 ret i64 %z 2827} 2828 
2829define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) { 2830; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext: 2831; CHECK-SD: // %bb.0: // %entry 2832; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 2833; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0 2834; CHECK-SD-NEXT: shl v0.2d, v0.2d, #48 2835; CHECK-SD-NEXT: shl v1.2d, v1.2d, #48 2836; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #48 2837; CHECK-SD-NEXT: ssra v0.2d, v1.2d, #48 2838; CHECK-SD-NEXT: addp d0, v0.2d 2839; CHECK-SD-NEXT: fmov x0, d0 2840; CHECK-SD-NEXT: ret 2841; 2842; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext: 2843; CHECK-GI: // %bb.0: // %entry 2844; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 2845; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 2846; CHECK-GI-NEXT: shl v0.2d, v0.2d, #48 2847; CHECK-GI-NEXT: shl v1.2d, v1.2d, #48 2848; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #48 2849; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #48 2850; CHECK-GI-NEXT: addp d0, v0.2d 2851; CHECK-GI-NEXT: addp d1, v1.2d 2852; CHECK-GI-NEXT: fmov x8, d0 2853; CHECK-GI-NEXT: fmov x9, d1 2854; CHECK-GI-NEXT: add x0, x8, x9 2855; CHECK-GI-NEXT: ret 2856entry: 2857 %xx = sext <2 x i16> %x to <2 x i64> 2858 %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) 2859 %yy = sext <2 x i16> %y to <2 x i64> 2860 %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) 2861 %z = add i64 %z1, %z2 2862 ret i64 %z 2863} 2864 2865define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) { 2866; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext: 2867; CHECK-SD-BASE: // %bb.0: // %entry 2868; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0 2869; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 2870; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0 2871; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 2872; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h 2873; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h 2874; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h 2875; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h 2876; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s 2877; CHECK-SD-BASE-NEXT: 
add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s1
; CHECK-GI-BASE-NEXT:    fmov w9, s0
; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
; CHECK-GI-DOT-NEXT:    fmov w8, s0
; CHECK-GI-DOT-NEXT:    fmov w9, s1
; CHECK-GI-DOT-NEXT:    add w0, w8, w9
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %yy = zext <16 x i8> %y to <16 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i32 add-reductions of %x and %y, each sign-extended from <16 x i8> to <16 x i32>.
define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT:    saddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s1
; CHECK-GI-BASE-NEXT:    fmov w9, s0
; CHECK-GI-BASE-NEXT:    sxth w8, w8
; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
; CHECK-GI-DOT-NEXT:    fmov w8, s0
; CHECK-GI-DOT-NEXT:    fmov w9, s1
; CHECK-GI-DOT-NEXT:    add w0, w8, w9
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %yy = sext <16 x i8> %y to <16 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i32 add-reductions of %x and %y, each zero-extended from <8 x i8> to <8 x i32>.
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s1
; CHECK-GI-BASE-NEXT:    fmov w9, s0
; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT:    udot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT:    fmov w8, s0
; CHECK-GI-DOT-NEXT:    fmov w9, s1
; CHECK-GI-DOT-NEXT:    add w0, w8, w9
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %yy = zext <8 x i8> %y to <8 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i32 add-reductions of %x and %y, each sign-extended from <8 x i8> to <8 x i32>.
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s1
; CHECK-GI-BASE-NEXT:    fmov w9, s0
; CHECK-GI-BASE-NEXT:    sxth w8, w8
; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT:    sdot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT:    fmov w8, s0
; CHECK-GI-DOT-NEXT:    fmov w9, s1
; CHECK-GI-DOT-NEXT:    add w0, w8, w9
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %yy = sext <8 x i8> %y to <8 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i32 add-reductions of %x and %y, each zero-extended from <4 x i8> to <4 x i32>.
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    uaddlv s1, v1.4h
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    and w8, w8, #0xffff
; CHECK-GI-NEXT:    add w0, w8, w9, uxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %yy = zext <4 x i8> %y to <4 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i32 add-reductions of %x and %y, each sign-extended from <4 x i8> to <4 x i32>.
define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s1, v1.4h
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    sxth w8, w8
; CHECK-GI-NEXT:    add w0, w8, w9, sxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %yy = sext <4 x i8> %y to <4 x i32>
  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
  %z = add i32 %z1, %z2
  ret i32 %z
}

; Adds the i16 add-reductions of %x and %y, each zero-extended from <16 x i8> to <16 x i16>; zeroext return.
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v1.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %yy = zext <16 x i8> %y to <16 x i16>
  %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
  %z = add i16 %z1, %z2
  ret i16 %z
}

; Adds the i16 add-reductions of %x and %y, each sign-extended from <16 x i8> to <16 x i16>; signext return.
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    saddlp v1.8h, v1.16b
; CHECK-SD-NEXT:    sadalp v1.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v1.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    saddlv h1, v1.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %yy = sext <16 x i8> %y to <16 x i16>
  %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
  %z = add i16 %z1, %z2
  ret i16 %z
}

; Adds the i16 add-reductions of %x and %y, each zero-extended from <8 x i8> to <8 x i16>; zeroext return.
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT:    uaddlv h0, v0.16b
; CHECK-SD-NEXT:    umov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    uaddlv h1, v1.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %yy = zext <8 x i8> %y to <8 x i16>
  %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
  %z = add i16 %z1, %z2
  ret i16 %z
}

; Adds the i16 add-reductions of %x and %y, each sign-extended from <8 x i8> to <8 x i16>; signext return.
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    saddlv h1, v1.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %yy = sext <8 x i8> %y to <8 x i16>
  %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
  %z = add i16 %z1, %z2
  ret i16 %z
}

; Adds the i8 add-reductions of %x and %y with no extension; zeroext return.
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    addv b0, v0.16b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv b0, v0.16b
; CHECK-GI-NEXT:    addv b1, v1.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
; CHECK-GI-NEXT:    and w0, w8, #0xff
; CHECK-GI-NEXT:    ret
entry:
  %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
  %z = add i8 %z1, %z2
  ret i8 %z
}

; Adds the i64 add-reductions of %x and %y, each zero-extended from <16 x i8> to <16 x i64>.
define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    ushll v4.4s, v2.4h, #0
; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT:    ushll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT:    ushll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT:    uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT:    uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    and x8, x8, #0xffff
; CHECK-GI-NEXT:    add x0, x8, w9, uxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  %yy = zext <16 x i8> %y to <16 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each sign-extended from <16 x i8> to <16 x i64>.
define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT:    sshll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    sshll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT:    sshll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT:    saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT:    saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h1, v1.16b
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    sxth x8, w8
; CHECK-GI-NEXT:    add x0, x8, w9, sxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  %yy = sext <16 x i8> %y to <16 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each zero-extended from <8 x i8> to <8 x i64>.
define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h1, v1.8b
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    and x8, x8, #0xffff
; CHECK-GI-NEXT:    add x0, x8, w9, uxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %yy = zext <8 x i8> %y to <8 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each sign-extended from <8 x i8> to <8 x i64>.
define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h1, v1.8b
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    sxth x8, w8
; CHECK-GI-NEXT:    add x0, x8, w9, sxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %yy = sext <8 x i8> %y to <8 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each zero-extended from <4 x i8> to <4 x i64>.
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
; CHECK-SD-NEXT:    addp d0, v1.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    uaddlv s1, v1.4h
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    and x8, x8, #0xffff
; CHECK-GI-NEXT:    add x0, x8, w9, uxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %yy = zext <4 x i8> %y to <4 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each sign-extended from <4 x i8> to <4 x i64>.
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
; CHECK-SD-NEXT:    ushll v3.2d, v0.2s, #0
; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
; CHECK-SD-NEXT:    shl v3.2d, v3.2d, #56
; CHECK-SD-NEXT:    shl v2.2d, v2.2d, #56
; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    sshr v3.2d, v3.2d, #56
; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #56
; CHECK-SD-NEXT:    ssra v3.2d, v0.2d, #56
; CHECK-SD-NEXT:    ssra v2.2d, v1.2d, #56
; CHECK-SD-NEXT:    add v0.2d, v3.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s1, v1.4h
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    sxth x8, w8
; CHECK-GI-NEXT:    add x0, x8, w9, sxth
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %yy = sext <4 x i8> %y to <4 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each zero-extended from <2 x i8> to <2 x i64>.
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    movi v2.2d, #0x0000ff000000ff
; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    addp d1, v1.2d
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    add x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %yy = zext <2 x i8> %y to <2 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds the i64 add-reductions of %x and %y, each sign-extended from <2 x i8> to <2 x i64>.
define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #56
; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #56
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    addp d1, v1.2d
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    add x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %yy = sext <2 x i8> %y to <2 x i64>
  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Adds four i32 add-reductions of <8 x i8> inputs widened to <8 x i32>:
; %ax and %ay are zero-extended, %bx and %by are sign-extended.
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT:    sshll v3.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT:    saddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT:    sadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
; CHECK-SD-DOT-NEXT:    movi v6.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
; CHECK-SD-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b
; CHECK-SD-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
; CHECK-SD-DOT-NEXT:    add v0.2s, v6.2s, v4.2s
; CHECK-SD-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h3, v3.8b
; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    saddlv h2, v2.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s3
; CHECK-GI-BASE-NEXT:    fmov w9, s1
; CHECK-GI-BASE-NEXT:    fmov w10, s0
; CHECK-GI-BASE-NEXT:    fmov w11, s2
; CHECK-GI-BASE-NEXT:    sxth w8, w8
; CHECK-GI-BASE-NEXT:    and w9, w9, #0xffff
; CHECK-GI-BASE-NEXT:    add w9, w9, w10, uxth
; CHECK-GI-BASE-NEXT:    add w8, w8, w11, sxth
; CHECK-GI-BASE-NEXT:    add w0, w9, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v6.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v5.2s, v0.8b, v4.8b
; CHECK-GI-DOT-NEXT:    sdot v6.2s, v3.8b, v4.8b
; CHECK-GI-DOT-NEXT:    udot v7.2s, v1.8b, v4.8b
; CHECK-GI-DOT-NEXT:    sdot v16.2s, v2.8b, v4.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v5.2s, v5.2s
; CHECK-GI-DOT-NEXT:    addp v3.2s, v6.2s, v6.2s
; CHECK-GI-DOT-NEXT:    addp v1.2s, v7.2s, v7.2s
; CHECK-GI-DOT-NEXT:    addp v2.2s, v16.2s, v16.2s
; CHECK-GI-DOT-NEXT:    fmov w8, s0
; CHECK-GI-DOT-NEXT:    fmov w11, s3
; CHECK-GI-DOT-NEXT:    fmov w9, s1
; CHECK-GI-DOT-NEXT:    fmov w10, s2
; CHECK-GI-DOT-NEXT:    add w8, w8, w9
; CHECK-GI-DOT-NEXT:    add w9, w10, w11
; CHECK-GI-DOT-NEXT:    add w0, w8, w9
; CHECK-GI-DOT-NEXT:    ret
entry:
  %axx = zext <8 x i8> %ax to <8 x i32>
  %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
  %ayy = zext <8 x i8> %ay to <8 x i32>
  %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
  %az = add i32 %az1, %az2
%bxx = sext <8 x i8> %bx to <8 x i32>
  %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
  %byy = sext <8 x i8> %by to <8 x i32>
  %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
  %bz = add i32 %bz1, %bz2
  %z = add i32 %az, %bz
  ret i32 %z
}

; Widens four <8 x i16> inputs to <8 x i32>, splits each into two <4 x i32>
; halves with shufflevector, sums all eight halves and reduces the result.
; NOTE: despite "sext" in the name, all four inputs are zero-extended here.
; The shuffle masks only select lanes of the first operand, so the second
; operand is a pure placeholder; it is written as poison rather than undef.
define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
; CHECK-SD-NEXT:    uaddlp v3.4s, v3.8h
; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h
; CHECK-SD-NEXT:    uadalp v3.4s, v2.8h
; CHECK-SD-NEXT:    add v0.4s, v3.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %axx = zext <8 x i16> %ax to <8 x i32>
  %s1h = shufflevector <8 x i32> %axx, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s1l = shufflevector <8 x i32> %axx, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %axs = add <4 x i32> %s1h, %s1l
  %ayy = zext <8 x i16> %ay to <8 x i32>
  %s2h = shufflevector <8 x i32> %ayy, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s2l = shufflevector <8 x i32> %ayy, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ays = add <4 x i32> %s2h, %s2l
  %az = add <4 x i32> %axs, %ays
  %bxx = zext <8 x i16> %bx to <8 x i32>
  %s3h = shufflevector <8 x i32> %bxx, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s3l = shufflevector <8 x i32> %bxx, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bxs = add <4 x i32> %s3h, %s3l
  %byy = zext <8 x i16> %by to <8 x i32>
  %s4h = shufflevector <8 x i32> %byy, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s4l = shufflevector <8 x i32> %byy, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %bys = add <4 x i32> %s4h, %s4l
  %bz = add <4 x i32> %bxs, %bys
  %z = add <4 x i32> %az, %bz
  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
  ret i32 %z2
}

; Adds the i64 add-reductions of two <2 x i64> vectors; no extension involved.
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-SD-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    addp d1, v1.2d
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    add x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
  %z = add i64 %z1, %z2
  ret i64 %z
}

; Irregularly sized vectors
define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
; CHECK-SD-LABEL: add_v24i8_v24i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov s0, w0
;
CHECK-SD-NEXT: ldr b1, [sp, #64] 3789; CHECK-SD-NEXT: add x8, sp, #72 3790; CHECK-SD-NEXT: ldr b2, [sp] 3791; CHECK-SD-NEXT: add x9, sp, #80 3792; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] 3793; CHECK-SD-NEXT: add x8, sp, #8 3794; CHECK-SD-NEXT: mov v0.b[1], w1 3795; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] 3796; CHECK-SD-NEXT: add x8, sp, #16 3797; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] 3798; CHECK-SD-NEXT: add x9, sp, #88 3799; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] 3800; CHECK-SD-NEXT: add x8, sp, #24 3801; CHECK-SD-NEXT: mov v0.b[2], w2 3802; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] 3803; CHECK-SD-NEXT: add x9, sp, #96 3804; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] 3805; CHECK-SD-NEXT: add x8, sp, #32 3806; CHECK-SD-NEXT: mov v0.b[3], w3 3807; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] 3808; CHECK-SD-NEXT: add x9, sp, #104 3809; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] 3810; CHECK-SD-NEXT: add x8, sp, #40 3811; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] 3812; CHECK-SD-NEXT: add x9, sp, #112 3813; CHECK-SD-NEXT: mov v0.b[4], w4 3814; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] 3815; CHECK-SD-NEXT: add x8, sp, #48 3816; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] 3817; CHECK-SD-NEXT: add x9, sp, #120 3818; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] 3819; CHECK-SD-NEXT: add x8, sp, #56 3820; CHECK-SD-NEXT: mov v0.b[5], w5 3821; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] 3822; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] 3823; CHECK-SD-NEXT: mov v0.b[6], w6 3824; CHECK-SD-NEXT: mov v0.b[7], w7 3825; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b 3826; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v2.8b 3827; CHECK-SD-NEXT: addv h0, v0.8h 3828; CHECK-SD-NEXT: fmov w0, s0 3829; CHECK-SD-NEXT: ret 3830; 3831; CHECK-GI-LABEL: add_v24i8_v24i16_zext: 3832; CHECK-GI: // %bb.0: // %entry 3833; CHECK-GI-NEXT: fmov s0, w0 3834; CHECK-GI-NEXT: ldr w8, [sp] 3835; CHECK-GI-NEXT: ldr w9, [sp, #8] 3836; CHECK-GI-NEXT: ldr w10, [sp, #72] 3837; CHECK-GI-NEXT: mov v0.b[1], w1 3838; CHECK-GI-NEXT: mov v0.b[2], w2 3839; CHECK-GI-NEXT: mov v0.b[3], w3 3840; CHECK-GI-NEXT: 
mov v0.b[4], w4 3841; CHECK-GI-NEXT: mov v0.b[5], w5 3842; CHECK-GI-NEXT: mov v0.b[6], w6 3843; CHECK-GI-NEXT: mov v0.b[7], w7 3844; CHECK-GI-NEXT: mov v0.b[8], w8 3845; CHECK-GI-NEXT: ldr w8, [sp, #64] 3846; CHECK-GI-NEXT: fmov s1, w8 3847; CHECK-GI-NEXT: ldr w8, [sp, #16] 3848; CHECK-GI-NEXT: mov v0.b[9], w9 3849; CHECK-GI-NEXT: ldr w9, [sp, #80] 3850; CHECK-GI-NEXT: mov v1.b[1], w10 3851; CHECK-GI-NEXT: mov v0.b[10], w8 3852; CHECK-GI-NEXT: ldr w8, [sp, #24] 3853; CHECK-GI-NEXT: mov v1.b[2], w9 3854; CHECK-GI-NEXT: ldr w9, [sp, #88] 3855; CHECK-GI-NEXT: mov v0.b[11], w8 3856; CHECK-GI-NEXT: ldr w8, [sp, #32] 3857; CHECK-GI-NEXT: mov v1.b[3], w9 3858; CHECK-GI-NEXT: ldr w9, [sp, #96] 3859; CHECK-GI-NEXT: mov v0.b[12], w8 3860; CHECK-GI-NEXT: ldr w8, [sp, #40] 3861; CHECK-GI-NEXT: mov v1.b[4], w9 3862; CHECK-GI-NEXT: ldr w9, [sp, #104] 3863; CHECK-GI-NEXT: mov v0.b[13], w8 3864; CHECK-GI-NEXT: ldr w8, [sp, #48] 3865; CHECK-GI-NEXT: mov v1.b[5], w9 3866; CHECK-GI-NEXT: ldr w9, [sp, #112] 3867; CHECK-GI-NEXT: mov v0.b[14], w8 3868; CHECK-GI-NEXT: ldr w8, [sp, #56] 3869; CHECK-GI-NEXT: mov v1.b[6], w9 3870; CHECK-GI-NEXT: ldr w9, [sp, #120] 3871; CHECK-GI-NEXT: mov v0.b[15], w8 3872; CHECK-GI-NEXT: mov v1.b[7], w9 3873; CHECK-GI-NEXT: uaddlv h0, v0.16b 3874; CHECK-GI-NEXT: uaddlv h1, v1.8b 3875; CHECK-GI-NEXT: fmov w8, s0 3876; CHECK-GI-NEXT: fmov w9, s1 3877; CHECK-GI-NEXT: add w0, w8, w9 3878; CHECK-GI-NEXT: ret 3879entry: 3880 %xx = zext <24 x i8> %x to <24 x i16> 3881 %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx) 3882 ret i16 %z 3883} 3884 3885define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) { 3886; CHECK-SD-LABEL: add_v32i8_v32i16_zext: 3887; CHECK-SD: // %bb.0: // %entry 3888; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b 3889; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b 3890; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h 3891; CHECK-SD-NEXT: addv h0, v0.8h 3892; CHECK-SD-NEXT: fmov w0, s0 3893; CHECK-SD-NEXT: ret 3894; 3895; CHECK-GI-LABEL: 
add_v32i8_v32i16_zext: 3896; CHECK-GI: // %bb.0: // %entry 3897; CHECK-GI-NEXT: uaddlv h0, v0.16b 3898; CHECK-GI-NEXT: uaddlv h1, v1.16b 3899; CHECK-GI-NEXT: fmov w8, s0 3900; CHECK-GI-NEXT: fmov w9, s1 3901; CHECK-GI-NEXT: add w0, w8, w9 3902; CHECK-GI-NEXT: ret 3903entry: 3904 %xx = zext <32 x i8> %x to <32 x i16> 3905 %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx) 3906 ret i16 %z 3907} 3908 3909define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { 3910; CHECK-SD-LABEL: add_v24i8_v24i16_sext: 3911; CHECK-SD: // %bb.0: // %entry 3912; CHECK-SD-NEXT: fmov s0, w0 3913; CHECK-SD-NEXT: ldr b1, [sp, #64] 3914; CHECK-SD-NEXT: add x8, sp, #72 3915; CHECK-SD-NEXT: ldr b2, [sp] 3916; CHECK-SD-NEXT: add x9, sp, #80 3917; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] 3918; CHECK-SD-NEXT: add x8, sp, #8 3919; CHECK-SD-NEXT: mov v0.b[1], w1 3920; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] 3921; CHECK-SD-NEXT: add x8, sp, #16 3922; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] 3923; CHECK-SD-NEXT: add x9, sp, #88 3924; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] 3925; CHECK-SD-NEXT: add x8, sp, #24 3926; CHECK-SD-NEXT: mov v0.b[2], w2 3927; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] 3928; CHECK-SD-NEXT: add x9, sp, #96 3929; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] 3930; CHECK-SD-NEXT: add x8, sp, #32 3931; CHECK-SD-NEXT: mov v0.b[3], w3 3932; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] 3933; CHECK-SD-NEXT: add x9, sp, #104 3934; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] 3935; CHECK-SD-NEXT: add x8, sp, #40 3936; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] 3937; CHECK-SD-NEXT: add x9, sp, #112 3938; CHECK-SD-NEXT: mov v0.b[4], w4 3939; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] 3940; CHECK-SD-NEXT: add x8, sp, #48 3941; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] 3942; CHECK-SD-NEXT: add x9, sp, #120 3943; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] 3944; CHECK-SD-NEXT: add x8, sp, #56 3945; CHECK-SD-NEXT: mov v0.b[5], w5 3946; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] 3947; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] 3948; CHECK-SD-NEXT: mov v0.b[6], w6 
3949; CHECK-SD-NEXT: mov v0.b[7], w7 3950; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b 3951; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v2.8b 3952; CHECK-SD-NEXT: addv h0, v0.8h 3953; CHECK-SD-NEXT: fmov w0, s0 3954; CHECK-SD-NEXT: ret 3955; 3956; CHECK-GI-LABEL: add_v24i8_v24i16_sext: 3957; CHECK-GI: // %bb.0: // %entry 3958; CHECK-GI-NEXT: fmov s0, w0 3959; CHECK-GI-NEXT: ldr w8, [sp] 3960; CHECK-GI-NEXT: ldr w9, [sp, #8] 3961; CHECK-GI-NEXT: ldr w10, [sp, #72] 3962; CHECK-GI-NEXT: mov v0.b[1], w1 3963; CHECK-GI-NEXT: mov v0.b[2], w2 3964; CHECK-GI-NEXT: mov v0.b[3], w3 3965; CHECK-GI-NEXT: mov v0.b[4], w4 3966; CHECK-GI-NEXT: mov v0.b[5], w5 3967; CHECK-GI-NEXT: mov v0.b[6], w6 3968; CHECK-GI-NEXT: mov v0.b[7], w7 3969; CHECK-GI-NEXT: mov v0.b[8], w8 3970; CHECK-GI-NEXT: ldr w8, [sp, #64] 3971; CHECK-GI-NEXT: fmov s1, w8 3972; CHECK-GI-NEXT: ldr w8, [sp, #16] 3973; CHECK-GI-NEXT: mov v0.b[9], w9 3974; CHECK-GI-NEXT: ldr w9, [sp, #80] 3975; CHECK-GI-NEXT: mov v1.b[1], w10 3976; CHECK-GI-NEXT: mov v0.b[10], w8 3977; CHECK-GI-NEXT: ldr w8, [sp, #24] 3978; CHECK-GI-NEXT: mov v1.b[2], w9 3979; CHECK-GI-NEXT: ldr w9, [sp, #88] 3980; CHECK-GI-NEXT: mov v0.b[11], w8 3981; CHECK-GI-NEXT: ldr w8, [sp, #32] 3982; CHECK-GI-NEXT: mov v1.b[3], w9 3983; CHECK-GI-NEXT: ldr w9, [sp, #96] 3984; CHECK-GI-NEXT: mov v0.b[12], w8 3985; CHECK-GI-NEXT: ldr w8, [sp, #40] 3986; CHECK-GI-NEXT: mov v1.b[4], w9 3987; CHECK-GI-NEXT: ldr w9, [sp, #104] 3988; CHECK-GI-NEXT: mov v0.b[13], w8 3989; CHECK-GI-NEXT: ldr w8, [sp, #48] 3990; CHECK-GI-NEXT: mov v1.b[5], w9 3991; CHECK-GI-NEXT: ldr w9, [sp, #112] 3992; CHECK-GI-NEXT: mov v0.b[14], w8 3993; CHECK-GI-NEXT: ldr w8, [sp, #56] 3994; CHECK-GI-NEXT: mov v1.b[6], w9 3995; CHECK-GI-NEXT: ldr w9, [sp, #120] 3996; CHECK-GI-NEXT: mov v0.b[15], w8 3997; CHECK-GI-NEXT: mov v1.b[7], w9 3998; CHECK-GI-NEXT: saddlv h0, v0.16b 3999; CHECK-GI-NEXT: saddlv h1, v1.8b 4000; CHECK-GI-NEXT: fmov w8, s0 4001; CHECK-GI-NEXT: fmov w9, s1 4002; CHECK-GI-NEXT: add w0, w8, w9 
4003; CHECK-GI-NEXT: ret 4004entry: 4005 %xx = sext <24 x i8> %x to <24 x i16> 4006 %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx) 4007 ret i16 %z 4008} 4009 4010define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) { 4011; CHECK-SD-LABEL: add_v32i8_v32i16_sext: 4012; CHECK-SD: // %bb.0: // %entry 4013; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b 4014; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b 4015; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h 4016; CHECK-SD-NEXT: addv h0, v0.8h 4017; CHECK-SD-NEXT: fmov w0, s0 4018; CHECK-SD-NEXT: ret 4019; 4020; CHECK-GI-LABEL: add_v32i8_v32i16_sext: 4021; CHECK-GI: // %bb.0: // %entry 4022; CHECK-GI-NEXT: saddlv h0, v0.16b 4023; CHECK-GI-NEXT: saddlv h1, v1.16b 4024; CHECK-GI-NEXT: fmov w8, s0 4025; CHECK-GI-NEXT: fmov w9, s1 4026; CHECK-GI-NEXT: add w0, w8, w9 4027; CHECK-GI-NEXT: ret 4028entry: 4029 %xx = sext <32 x i8> %x to <32 x i16> 4030 %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx) 4031 ret i16 %z 4032} 4033 4034; Irregularly sized vectors and larger extends 4035define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { 4036; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext: 4037; CHECK-SD-BASE: // %bb.0: // %entry 4038; CHECK-SD-BASE-NEXT: fmov s0, w0 4039; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] 4040; CHECK-SD-BASE-NEXT: add x8, sp, #72 4041; CHECK-SD-BASE-NEXT: ldr b2, [sp] 4042; CHECK-SD-BASE-NEXT: add x9, sp, #80 4043; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] 4044; CHECK-SD-BASE-NEXT: add x8, sp, #8 4045; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 4046; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] 4047; CHECK-SD-BASE-NEXT: add x8, sp, #16 4048; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] 4049; CHECK-SD-BASE-NEXT: add x9, sp, #88 4050; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] 4051; CHECK-SD-BASE-NEXT: add x8, sp, #24 4052; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 4053; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] 4054; CHECK-SD-BASE-NEXT: add x9, sp, #96 4055; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] 4056; CHECK-SD-BASE-NEXT: add x8, 
sp, #32 4057; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 4058; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] 4059; CHECK-SD-BASE-NEXT: add x9, sp, #104 4060; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] 4061; CHECK-SD-BASE-NEXT: add x8, sp, #40 4062; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] 4063; CHECK-SD-BASE-NEXT: add x9, sp, #112 4064; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 4065; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] 4066; CHECK-SD-BASE-NEXT: add x8, sp, #48 4067; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] 4068; CHECK-SD-BASE-NEXT: add x9, sp, #120 4069; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] 4070; CHECK-SD-BASE-NEXT: add x8, sp, #56 4071; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 4072; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] 4073; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] 4074; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 4075; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 4076; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0 4077; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 4078; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 4079; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h 4080; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h 4081; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h 4082; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h 4083; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 4084; CHECK-SD-BASE-NEXT: addv s0, v0.4s 4085; CHECK-SD-BASE-NEXT: fmov w0, s0 4086; CHECK-SD-BASE-NEXT: ret 4087; 4088; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext: 4089; CHECK-SD-DOT: // %bb.0: // %entry 4090; CHECK-SD-DOT-NEXT: fmov s0, w0 4091; CHECK-SD-DOT-NEXT: mov x8, sp 4092; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64] 4093; CHECK-SD-DOT-NEXT: add x9, sp, #72 4094; CHECK-SD-DOT-NEXT: movi v2.16b, #1 4095; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000 4096; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9] 4097; CHECK-SD-DOT-NEXT: add x9, sp, #80 4098; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000 4099; CHECK-SD-DOT-NEXT: mov v0.b[1], w1 4100; CHECK-SD-DOT-NEXT: movi v5.8b, #1 4101; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9] 4102; 
CHECK-SD-DOT-NEXT: add x9, sp, #88 4103; CHECK-SD-DOT-NEXT: mov v0.b[2], w2 4104; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9] 4105; CHECK-SD-DOT-NEXT: add x9, sp, #96 4106; CHECK-SD-DOT-NEXT: mov v0.b[3], w3 4107; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9] 4108; CHECK-SD-DOT-NEXT: add x9, sp, #104 4109; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9] 4110; CHECK-SD-DOT-NEXT: add x9, sp, #112 4111; CHECK-SD-DOT-NEXT: mov v0.b[4], w4 4112; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9] 4113; CHECK-SD-DOT-NEXT: add x9, sp, #120 4114; CHECK-SD-DOT-NEXT: mov v0.b[5], w5 4115; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9] 4116; CHECK-SD-DOT-NEXT: mov v0.b[6], w6 4117; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b 4118; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 4119; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s 4120; CHECK-SD-DOT-NEXT: fmov w9, s1 4121; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] 4122; CHECK-SD-DOT-NEXT: add x8, sp, #8 4123; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] 4124; CHECK-SD-DOT-NEXT: add x8, sp, #16 4125; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] 4126; CHECK-SD-DOT-NEXT: add x8, sp, #24 4127; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8] 4128; CHECK-SD-DOT-NEXT: add x8, sp, #32 4129; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8] 4130; CHECK-SD-DOT-NEXT: add x8, sp, #40 4131; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8] 4132; CHECK-SD-DOT-NEXT: add x8, sp, #48 4133; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8] 4134; CHECK-SD-DOT-NEXT: add x8, sp, #56 4135; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8] 4136; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b 4137; CHECK-SD-DOT-NEXT: addv s0, v3.4s 4138; CHECK-SD-DOT-NEXT: fmov w8, s0 4139; CHECK-SD-DOT-NEXT: add w0, w8, w9 4140; CHECK-SD-DOT-NEXT: ret 4141; 4142; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext: 4143; CHECK-GI-BASE: // %bb.0: // %entry 4144; CHECK-GI-BASE-NEXT: fmov s0, w0 4145; CHECK-GI-BASE-NEXT: ldr w8, [sp] 4146; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] 4147; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] 4148; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 4149; 
CHECK-GI-BASE-NEXT: mov v0.b[2], w2 4150; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 4151; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 4152; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 4153; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 4154; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 4155; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 4156; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] 4157; CHECK-GI-BASE-NEXT: fmov s1, w8 4158; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] 4159; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 4160; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] 4161; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 4162; CHECK-GI-BASE-NEXT: mov v0.b[10], w8 4163; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] 4164; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 4165; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] 4166; CHECK-GI-BASE-NEXT: mov v0.b[11], w8 4167; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] 4168; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 4169; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] 4170; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 4171; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] 4172; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 4173; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] 4174; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 4175; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] 4176; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 4177; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] 4178; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 4179; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] 4180; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 4181; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] 4182; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 4183; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 4184; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b 4185; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b 4186; CHECK-GI-BASE-NEXT: fmov w8, s0 4187; CHECK-GI-BASE-NEXT: fmov w9, s1 4188; CHECK-GI-BASE-NEXT: add w8, w8, w9 4189; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff 4190; CHECK-GI-BASE-NEXT: ret 4191; 4192; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext: 4193; CHECK-GI-DOT: // %bb.0: // %entry 4194; CHECK-GI-DOT-NEXT: fmov s0, w0 4195; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] 4196; CHECK-GI-DOT-NEXT: ldr w8, [sp] 4197; 
CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] 4198; CHECK-GI-DOT-NEXT: movi v2.8b, #1 4199; CHECK-GI-DOT-NEXT: movi v3.8b, #1 4200; CHECK-GI-DOT-NEXT: fmov s1, w9 4201; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] 4202; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 4203; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 4204; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 4205; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 4206; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] 4207; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 4208; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 4209; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] 4210; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 4211; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 4212; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] 4213; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 4214; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 4215; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] 4216; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 4217; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 4218; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] 4219; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 4220; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 4221; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] 4222; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 4223; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 4224; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 4225; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] 4226; CHECK-GI-DOT-NEXT: fmov d1, d1 4227; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 4228; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] 4229; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b 4230; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 4231; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] 4232; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 4233; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] 4234; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 4235; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] 4236; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 4237; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] 4238; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 4239; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] 4240; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 4241; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b 4242; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s 4243; CHECK-GI-DOT-NEXT: 
addv s0, v0.4s 4244; CHECK-GI-DOT-NEXT: fmov w0, s0 4245; CHECK-GI-DOT-NEXT: ret 4246entry: 4247 %xx = zext <24 x i8> %x to <24 x i32> 4248 %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx) 4249 ret i32 %z 4250} 4251 4252define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) { 4253; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext: 4254; CHECK-SD-BASE: // %bb.0: // %entry 4255; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v1.16b, #0 4256; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v0.16b, #0 4257; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 4258; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 4259; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v3.8h, v2.8h 4260; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h 4261; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v0.8h, v1.8h 4262; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h 4263; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s 4264; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s 4265; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 4266; CHECK-SD-BASE-NEXT: addv s0, v0.4s 4267; CHECK-SD-BASE-NEXT: fmov w0, s0 4268; CHECK-SD-BASE-NEXT: ret 4269; 4270; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext: 4271; CHECK-SD-DOT: // %bb.0: // %entry 4272; CHECK-SD-DOT-NEXT: movi v2.16b, #1 4273; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000 4274; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b 4275; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b 4276; CHECK-SD-DOT-NEXT: addv s0, v3.4s 4277; CHECK-SD-DOT-NEXT: fmov w0, s0 4278; CHECK-SD-DOT-NEXT: ret 4279; 4280; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext: 4281; CHECK-GI-BASE: // %bb.0: // %entry 4282; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b 4283; CHECK-GI-BASE-NEXT: uaddlv h1, v1.16b 4284; CHECK-GI-BASE-NEXT: fmov w8, s0 4285; CHECK-GI-BASE-NEXT: fmov w9, s1 4286; CHECK-GI-BASE-NEXT: add w8, w8, w9 4287; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff 4288; CHECK-GI-BASE-NEXT: ret 4289; 4290; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext: 4291; CHECK-GI-DOT: // %bb.0: // %entry 4292; CHECK-GI-DOT-NEXT: movi v2.16b, #1 4293; CHECK-GI-DOT-NEXT: movi v3.2d, 
#0000000000000000 4294; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 4295; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b 4296; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b 4297; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v3.4s 4298; CHECK-GI-DOT-NEXT: addv s0, v0.4s 4299; CHECK-GI-DOT-NEXT: fmov w0, s0 4300; CHECK-GI-DOT-NEXT: ret 4301entry: 4302 %xx = zext <32 x i8> %x to <32 x i32> 4303 %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx) 4304 ret i32 %z 4305} 4306 4307define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { 4308; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext: 4309; CHECK-SD-BASE: // %bb.0: // %entry 4310; CHECK-SD-BASE-NEXT: fmov s0, w0 4311; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] 4312; CHECK-SD-BASE-NEXT: add x8, sp, #72 4313; CHECK-SD-BASE-NEXT: ldr b2, [sp] 4314; CHECK-SD-BASE-NEXT: add x9, sp, #80 4315; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] 4316; CHECK-SD-BASE-NEXT: add x8, sp, #8 4317; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 4318; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] 4319; CHECK-SD-BASE-NEXT: add x8, sp, #16 4320; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] 4321; CHECK-SD-BASE-NEXT: add x9, sp, #88 4322; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] 4323; CHECK-SD-BASE-NEXT: add x8, sp, #24 4324; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 4325; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] 4326; CHECK-SD-BASE-NEXT: add x9, sp, #96 4327; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] 4328; CHECK-SD-BASE-NEXT: add x8, sp, #32 4329; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 4330; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] 4331; CHECK-SD-BASE-NEXT: add x9, sp, #104 4332; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] 4333; CHECK-SD-BASE-NEXT: add x8, sp, #40 4334; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] 4335; CHECK-SD-BASE-NEXT: add x9, sp, #112 4336; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 4337; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] 4338; CHECK-SD-BASE-NEXT: add x8, sp, #48 4339; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] 4340; CHECK-SD-BASE-NEXT: add x9, sp, #120 4341; 
CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] 4342; CHECK-SD-BASE-NEXT: add x8, sp, #56 4343; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 4344; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] 4345; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] 4346; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 4347; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 4348; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0 4349; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 4350; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 4351; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h 4352; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h 4353; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h 4354; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h 4355; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s 4356; CHECK-SD-BASE-NEXT: addv s0, v0.4s 4357; CHECK-SD-BASE-NEXT: fmov w0, s0 4358; CHECK-SD-BASE-NEXT: ret 4359; 4360; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext: 4361; CHECK-SD-DOT: // %bb.0: // %entry 4362; CHECK-SD-DOT-NEXT: fmov s0, w0 4363; CHECK-SD-DOT-NEXT: mov x8, sp 4364; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64] 4365; CHECK-SD-DOT-NEXT: add x9, sp, #72 4366; CHECK-SD-DOT-NEXT: movi v2.16b, #1 4367; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000 4368; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9] 4369; CHECK-SD-DOT-NEXT: add x9, sp, #80 4370; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000 4371; CHECK-SD-DOT-NEXT: mov v0.b[1], w1 4372; CHECK-SD-DOT-NEXT: movi v5.8b, #1 4373; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9] 4374; CHECK-SD-DOT-NEXT: add x9, sp, #88 4375; CHECK-SD-DOT-NEXT: mov v0.b[2], w2 4376; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9] 4377; CHECK-SD-DOT-NEXT: add x9, sp, #96 4378; CHECK-SD-DOT-NEXT: mov v0.b[3], w3 4379; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9] 4380; CHECK-SD-DOT-NEXT: add x9, sp, #104 4381; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9] 4382; CHECK-SD-DOT-NEXT: add x9, sp, #112 4383; CHECK-SD-DOT-NEXT: mov v0.b[4], w4 4384; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9] 4385; CHECK-SD-DOT-NEXT: add x9, sp, #120 4386; CHECK-SD-DOT-NEXT: mov v0.b[5], 
w5 4387; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9] 4388; CHECK-SD-DOT-NEXT: mov v0.b[6], w6 4389; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b 4390; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 4391; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s 4392; CHECK-SD-DOT-NEXT: fmov w9, s1 4393; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] 4394; CHECK-SD-DOT-NEXT: add x8, sp, #8 4395; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] 4396; CHECK-SD-DOT-NEXT: add x8, sp, #16 4397; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] 4398; CHECK-SD-DOT-NEXT: add x8, sp, #24 4399; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8] 4400; CHECK-SD-DOT-NEXT: add x8, sp, #32 4401; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8] 4402; CHECK-SD-DOT-NEXT: add x8, sp, #40 4403; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8] 4404; CHECK-SD-DOT-NEXT: add x8, sp, #48 4405; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8] 4406; CHECK-SD-DOT-NEXT: add x8, sp, #56 4407; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8] 4408; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b 4409; CHECK-SD-DOT-NEXT: addv s0, v3.4s 4410; CHECK-SD-DOT-NEXT: fmov w8, s0 4411; CHECK-SD-DOT-NEXT: add w0, w8, w9 4412; CHECK-SD-DOT-NEXT: ret 4413; 4414; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext: 4415; CHECK-GI-BASE: // %bb.0: // %entry 4416; CHECK-GI-BASE-NEXT: fmov s0, w0 4417; CHECK-GI-BASE-NEXT: ldr w8, [sp] 4418; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8] 4419; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] 4420; CHECK-GI-BASE-NEXT: mov v0.b[1], w1 4421; CHECK-GI-BASE-NEXT: mov v0.b[2], w2 4422; CHECK-GI-BASE-NEXT: mov v0.b[3], w3 4423; CHECK-GI-BASE-NEXT: mov v0.b[4], w4 4424; CHECK-GI-BASE-NEXT: mov v0.b[5], w5 4425; CHECK-GI-BASE-NEXT: mov v0.b[6], w6 4426; CHECK-GI-BASE-NEXT: mov v0.b[7], w7 4427; CHECK-GI-BASE-NEXT: mov v0.b[8], w8 4428; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64] 4429; CHECK-GI-BASE-NEXT: fmov s1, w8 4430; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] 4431; CHECK-GI-BASE-NEXT: mov v0.b[9], w9 4432; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] 4433; CHECK-GI-BASE-NEXT: mov v1.b[1], w10 4434; 
CHECK-GI-BASE-NEXT: mov v0.b[10], w8 4435; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] 4436; CHECK-GI-BASE-NEXT: mov v1.b[2], w9 4437; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] 4438; CHECK-GI-BASE-NEXT: mov v0.b[11], w8 4439; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] 4440; CHECK-GI-BASE-NEXT: mov v1.b[3], w9 4441; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] 4442; CHECK-GI-BASE-NEXT: mov v0.b[12], w8 4443; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] 4444; CHECK-GI-BASE-NEXT: mov v1.b[4], w9 4445; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] 4446; CHECK-GI-BASE-NEXT: mov v0.b[13], w8 4447; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] 4448; CHECK-GI-BASE-NEXT: mov v1.b[5], w9 4449; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] 4450; CHECK-GI-BASE-NEXT: mov v0.b[14], w8 4451; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] 4452; CHECK-GI-BASE-NEXT: mov v1.b[6], w9 4453; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] 4454; CHECK-GI-BASE-NEXT: mov v0.b[15], w8 4455; CHECK-GI-BASE-NEXT: mov v1.b[7], w9 4456; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b 4457; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b 4458; CHECK-GI-BASE-NEXT: fmov w8, s0 4459; CHECK-GI-BASE-NEXT: fmov w9, s1 4460; CHECK-GI-BASE-NEXT: add w8, w8, w9 4461; CHECK-GI-BASE-NEXT: sxth w0, w8 4462; CHECK-GI-BASE-NEXT: ret 4463; 4464; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext: 4465; CHECK-GI-DOT: // %bb.0: // %entry 4466; CHECK-GI-DOT-NEXT: fmov s0, w0 4467; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] 4468; CHECK-GI-DOT-NEXT: ldr w8, [sp] 4469; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] 4470; CHECK-GI-DOT-NEXT: movi v2.8b, #1 4471; CHECK-GI-DOT-NEXT: movi v3.8b, #1 4472; CHECK-GI-DOT-NEXT: fmov s1, w9 4473; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] 4474; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 4475; CHECK-GI-DOT-NEXT: mov v0.b[1], w1 4476; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 4477; CHECK-GI-DOT-NEXT: mov v1.b[1], w10 4478; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] 4479; CHECK-GI-DOT-NEXT: mov v0.b[2], w2 4480; CHECK-GI-DOT-NEXT: mov v1.b[2], w9 4481; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] 
4482; CHECK-GI-DOT-NEXT: mov v0.b[3], w3 4483; CHECK-GI-DOT-NEXT: mov v1.b[3], w9 4484; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] 4485; CHECK-GI-DOT-NEXT: mov v0.b[4], w4 4486; CHECK-GI-DOT-NEXT: mov v1.b[4], w9 4487; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] 4488; CHECK-GI-DOT-NEXT: mov v0.b[5], w5 4489; CHECK-GI-DOT-NEXT: mov v1.b[5], w9 4490; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] 4491; CHECK-GI-DOT-NEXT: mov v0.b[6], w6 4492; CHECK-GI-DOT-NEXT: mov v1.b[6], w9 4493; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] 4494; CHECK-GI-DOT-NEXT: mov v0.b[7], w7 4495; CHECK-GI-DOT-NEXT: mov v1.b[7], w9 4496; CHECK-GI-DOT-NEXT: mov v0.b[8], w8 4497; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] 4498; CHECK-GI-DOT-NEXT: fmov d1, d1 4499; CHECK-GI-DOT-NEXT: mov v0.b[9], w8 4500; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] 4501; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b 4502; CHECK-GI-DOT-NEXT: mov v0.b[10], w8 4503; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] 4504; CHECK-GI-DOT-NEXT: mov v0.b[11], w8 4505; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] 4506; CHECK-GI-DOT-NEXT: mov v0.b[12], w8 4507; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] 4508; CHECK-GI-DOT-NEXT: mov v0.b[13], w8 4509; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] 4510; CHECK-GI-DOT-NEXT: mov v0.b[14], w8 4511; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] 4512; CHECK-GI-DOT-NEXT: mov v0.b[15], w8 4513; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b 4514; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s 4515; CHECK-GI-DOT-NEXT: addv s0, v0.4s 4516; CHECK-GI-DOT-NEXT: fmov w0, s0 4517; CHECK-GI-DOT-NEXT: ret 4518entry: 4519 %xx = sext <24 x i8> %x to <24 x i32> 4520 %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx) 4521 ret i32 %z 4522} 4523 4524define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) { 4525; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext: 4526; CHECK-SD-BASE: // %bb.0: // %entry 4527; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v1.16b, #0 4528; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v0.16b, #0 4529; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 4530; CHECK-SD-BASE-NEXT: sshll v0.8h, 
v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    fmov w9, s1
; CHECK-GI-BASE-NEXT:    add w8, w8, w9
; CHECK-GI-BASE-NEXT:    sxth w0, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s
; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <32 x i8> %x to <32 x i32>
  %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
  ret i32 %z
}

; Sum of absolute differences over eight rows of eight bytes.
; Each row loads <8 x i8> from both pointers, zero-extends to <8 x i32>,
; takes abs(a - b), reduces the lanes with vector.reduce.add and adds the
; result to a running scalar total. Between rows %p1 advances by sext(%s1)
; bytes and %p2 advances by sext(%s2) bytes.
define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-SD-BASE-LABEL: full:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-SD-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-SD-BASE-NEXT:    sxtw x8, w3
; CHECK-SD-BASE-NEXT:    sxtw x9, w1
; CHECK-SD-BASE-NEXT:    ldr d0, [x0]
; CHECK-SD-BASE-NEXT:    ldr d1, [x2]
; CHECK-SD-BASE-NEXT:    add x10, x0, x9
; CHECK-SD-BASE-NEXT:    add x11, x2, x8
; CHECK-SD-BASE-NEXT:    uabdl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    add x10, x10, x9
; CHECK-SD-BASE-NEXT:    add x11, x11, x8
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    add x11, x11, x8
; CHECK-SD-BASE-NEXT:    uaddlp v0.4s, v0.8h
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    add x10, x10, x9
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    add x11, x11, x8
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    add x10, x10, x9
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    add x11, x11, x8
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    add x10, x10, x9
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    add x11, x11, x8
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    add x10, x10, x9
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    ldr d2, [x11, x8]
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    ldr d1, [x10, x9]
; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: full:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    ldr d0, [x0]
; CHECK-SD-DOT-NEXT:    ldr d1, [x2]
; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-SD-DOT-NEXT:    sxtw x8, w3
; CHECK-SD-DOT-NEXT:    sxtw x9, w1
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT:    add x11, x2, x8
; CHECK-SD-DOT-NEXT:    add x10, x0, x9
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    add x11, x11, x8
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    add x10, x10, x9
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    add x10, x10, x9
; CHECK-SD-DOT-NEXT:    add x11, x11, x8
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    add x10, x10, x9
; CHECK-SD-DOT-NEXT:    add x11, x11, x8
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    add x10, x10, x9
; CHECK-SD-DOT-NEXT:    add x11, x11, x8
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    add x10, x10, x9
; CHECK-SD-DOT-NEXT:    add x11, x11, x8
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9]
; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8]
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-LABEL: full:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
; CHECK-GI-NEXT:    sxtw x9, w1
; CHECK-GI-NEXT:    sxtw x8, w3
; CHECK-GI-NEXT:    ldr d0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x2]
; CHECK-GI-NEXT:    add x10, x0, x9
; CHECK-GI-NEXT:    add x11, x2, x8
; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT:    ldr d1, [x10]
; CHECK-GI-NEXT:    ldr d2, [x11]
; CHECK-GI-NEXT:    add x10, x10, x9
; CHECK-GI-NEXT:    add x11, x11, x8
; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
; CHECK-GI-NEXT:    ldr d3, [x10]
; CHECK-GI-NEXT:    ldr d4, [x11]
; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    add x10, x10, x9
; CHECK-GI-NEXT:    add x11, x11, x8
; CHECK-GI-NEXT:    ldr d2, [x10]
; CHECK-GI-NEXT:    add x10, x10, x9
; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    ldr d6, [x11]
; CHECK-GI-NEXT:    add x11, x11, x8
; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
; CHECK-GI-NEXT:    abs v5.4s, v5.4s
; CHECK-GI-NEXT:    abs v0.4s, v0.4s
; CHECK-GI-NEXT:    ldr d4, [x10]
; CHECK-GI-NEXT:    ldr d16, [x11]
; CHECK-GI-NEXT:    abs v7.4s, v7.4s
; CHECK-GI-NEXT:    abs v1.4s, v1.4s
; CHECK-GI-NEXT:    add x10, x10, x9
; CHECK-GI-NEXT:    add x11, x11, x8
; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
; CHECK-GI-NEXT:    ldr d6, [x10]
; CHECK-GI-NEXT:    ldr d17, [x11]
; CHECK-GI-NEXT:    add x10, x10, x9
; CHECK-GI-NEXT:    add x11, x11, x8
; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
; CHECK-GI-NEXT:    ldr d5, [x10]
; CHECK-GI-NEXT:    ldr d7, [x11]
; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
; CHECK-GI-NEXT:    ldr d17, [x11, x8]
; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
; CHECK-GI-NEXT:    ldr d7, [x10, x9]
; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT:    abs v16.4s, v16.4s
; CHECK-GI-NEXT:    abs v3.4s, v3.4s
; CHECK-GI-NEXT:    abs v18.4s, v18.4s
; CHECK-GI-NEXT:    abs v2.4s, v2.4s
; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
; CHECK-GI-NEXT:    abs v19.4s, v19.4s
; CHECK-GI-NEXT:    abs v4.4s, v4.4s
; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
; CHECK-GI-NEXT:    abs v17.4s, v17.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    abs v6.4s, v6.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
; CHECK-GI-NEXT:    addv s3, v3.4s
; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
; CHECK-GI-NEXT:    abs v16.4s, v16.4s
; CHECK-GI-NEXT:    abs v5.4s, v5.4s
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
; CHECK-GI-NEXT:    addv s2, v2.4s
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    addv s4, v4.4s
; CHECK-GI-NEXT:    fmov w10, s3
; CHECK-GI-NEXT:    abs v18.4s, v18.4s
; CHECK-GI-NEXT:    abs v7.4s, v7.4s
; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    addv s3, v6.4s
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    add w8, w10, w8
; CHECK-GI-NEXT:    fmov w10, s4
; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    add w8, w9, w8
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    add w8, w10, w8
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    add w8, w9, w8
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w9, w8
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    add w0, w9, w8
; CHECK-GI-NEXT:    ret
entry:
  ; Byte strides for %p2 (%idx.ext8) and %p1 (%idx.ext).
  %idx.ext8 = sext i32 %s2 to i64
  %idx.ext = sext i32 %s1 to i64
  ; Row 0: read directly from %p1 / %p2; this seeds the running total (%6).
  %0 = load <8 x i8>, ptr %p1, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %p2, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = sub nsw <8 x i32> %1, %3
  %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  ; Row 1: step both pointers by their stride, then accumulate into %op.rdx.1.
  %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
  %7 = load <8 x i8>, ptr %add.ptr, align 1
  %8 = zext <8 x i8> %7 to <8 x i32>
  %9 = load <8 x i8>, ptr %add.ptr9, align 1
  %10 = zext <8 x i8> %9 to <8 x i32>
  %11 = sub nsw <8 x i32> %8, %10
  %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  ; Row 2.
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <8 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <8 x i8> %14 to <8 x i32>
  %16 = load <8 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <8 x i8> %16 to <8 x i32>
  %18 = sub nsw <8 x i32> %15, %17
  %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  ; Row 3.
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <8 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <8 x i8> %21 to <8 x i32>
  %23 = load <8 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <8 x i8> %23 to <8 x i32>
  %25 = sub nsw <8 x i32> %22, %24
  %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ; Row 4.
  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
  %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
  %28 = load <8 x i8>, ptr %add.ptr.3, align 1
  %29 = zext <8 x i8> %28 to <8 x i32>
  %30 = load <8 x i8>, ptr %add.ptr9.3, align 1
  %31 = zext <8 x i8> %30 to <8 x i32>
  %32 = sub nsw <8 x i32> %29, %31
  %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
  %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
  %op.rdx.4 = add i32 %34, %op.rdx.3
  ; Row 5.
  %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
  %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
  %35 = load <8 x i8>, ptr %add.ptr.4, align 1
  %36 = zext <8 x i8> %35 to <8 x i32>
  %37 = load <8 x i8>, ptr %add.ptr9.4, align 1
  %38 = zext <8 x i8> %37 to <8 x i32>
  %39 = sub nsw <8 x i32> %36, %38
  %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
  %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
  %op.rdx.5 = add i32 %41, %op.rdx.4
  ; Row 6.
  %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
  %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
  %42 = load <8 x i8>, ptr %add.ptr.5, align 1
  %43 = zext <8 x i8> %42 to <8 x i32>
  %44 = load <8 x i8>, ptr %add.ptr9.5, align 1
  %45 = zext <8 x i8> %44 to <8 x i32>
  %46 = sub nsw <8 x i32> %43, %45
  %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
  %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
  %op.rdx.6 = add i32 %48, %op.rdx.5
  ; Row 7: last row; %op.rdx.7 is the returned total.
  %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
  %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
  %49 = load <8 x i8>, ptr %add.ptr.6, align 1
  %50 = zext <8 x i8> %49 to <8 x i32>
  %51 = load <8 x i8>, ptr %add.ptr9.6, align 1
  %52 = zext <8 x i8> %51 to <8 x i32>
  %53 = sub nsw <8 x i32> %50, %52
  %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
  %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
  %op.rdx.7 = add i32 %55, %op.rdx.6
  ret i32 %op.rdx.7
}

; reduce.add(zext(lo half of %a) + zext(hi half of %a)).
; The second shufflevector operand is never selected by the mask, so it is
; poison rather than the deprecated undef placeholder.
define i32 @extract_hi_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_lo:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: extract_hi_lo:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %z1 = zext <4 x i16> %e1 to <4 x i32>
  %z2 = zext <4 x i16> %e2 to <4 x i32>
  %z4 = add <4 x i32> %z1, %z2
  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
  ret i32 %z5
}

; reduce.add(zext(hi half of %a) + zext(hi half of %a)): the high half is
; added to itself. The unused second shufflevector operand is poison.
define i32 @extract_hi_hi(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_hi:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov v0.d[0], v0.d[1]
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: extract_hi_hi:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddl2 v0.4s, v0.8h, v0.8h
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %z2 = zext <4 x i16> %e2 to <4 x i32>
  %z4 = add <4 x i32> %z2, %z2
  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
  ret i32 %z5
}

; reduce.add(zext(lo half of %a) + zext(lo half of %a)): the low half is
; added to itself. The unused second shufflevector operand is poison.
define i32 @extract_lo_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_lo_lo:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov v0.d[1], v0.d[0]
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: extract_lo_lo:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddl v0.4s, v0.4h, v0.4h
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %z1 = zext <4 x i16> %e1 to <4 x i32>
  %z4 = add <4 x i32> %z1, %z1
  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
  ret i32 %z5
}

; Intrinsic declarations.
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)