; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
declare i16 @llvm.vector.reduce.add.v3i16(<3 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v3i32(<3 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>)

; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v4i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128


define i8 @add_B(ptr %arr) {
; CHECK-LABEL: add_B:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %bin.rdx = load <16 x i8>, ptr %arr
  %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %bin.rdx)
  ret i8 %r
}

define i16 @add_H(ptr %arr) {
; CHECK-LABEL: add_H:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %bin.rdx = load <8 x i16>, ptr %arr
  %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
  ret i16 %r
}

define i32 @add_S(ptr %arr) {
; CHECK-LABEL: add_S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %bin.rdx = load <4 x i32>, ptr %arr
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  ret i32 %r
}

define i64 @add_D(ptr %arr) {
; CHECK-LABEL: add_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %bin.rdx = load <2 x i64>, ptr %arr
  %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %bin.rdx)
  ret i64 %r
}


define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) {
; CHECK-SD-LABEL: oversized_ADDV_256:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr d0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1]
; CHECK-SD-NEXT:    uabdl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: oversized_ADDV_256:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    ldr d2, [x1]
; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-GI-NEXT:    ssubw2 v0.4s, v0.4s, v1.8h
; CHECK-GI-NEXT:    cmlt v4.4s, v2.4s, #0
; CHECK-GI-NEXT:    cmlt v5.4s, v3.4s, #0
; CHECK-GI-NEXT:    neg v6.4s, v2.4s
; CHECK-GI-NEXT:    mov v1.16b, v4.16b
; CHECK-GI-NEXT:    bif v0.16b, v3.16b, v5.16b
; CHECK-GI-NEXT:    bsl v1.16b, v6.16b, v2.16b
; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %arg1, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %arg2, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = sub nsw <8 x i32> %1, %3
  %5 = icmp slt <8 x i32> %4, zeroinitializer
  %6 = sub nsw <8 x i32> zeroinitializer, %4
  %7 = select <8 x i1> %5, <8 x i32> %6, <8 x i32> %4
  %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  ret i32 %r
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @oversized_ADDV_512(ptr %arr) {
; CHECK-SD-LABEL: oversized_ADDV_512:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldp q0, q1, [x0, #32]
; CHECK-SD-NEXT:    ldp q2, q3, [x0]
; CHECK-SD-NEXT:    add v1.4s, v3.4s, v1.4s
; CHECK-SD-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: oversized_ADDV_512:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldp q0, q1, [x0]
; CHECK-GI-NEXT:    ldp q2, q3, [x0, #32]
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %bin.rdx = load <16 x i32>, ptr %arr
  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
  ret i32 %r
}

define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-SD-LABEL: addv_combine_i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    addv b0, v0.8b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addv_combine_i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv b0, v0.8b
; CHECK-GI-NEXT:    addv b1, v1.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w9, w8, uxtb
; CHECK-GI-NEXT:    ret
entry:
  %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1)
  %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2)
  %r = add i8 %rdx.1, %rdx.2
  ret i8 %r
}

define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-SD-LABEL: addv_combine_i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT:    addv h0, v0.4h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addv_combine_i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv h0, v0.4h
; CHECK-GI-NEXT:    addv h1, v1.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w9, w8, uxth
; CHECK-GI-NEXT:    ret
entry:
  %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1)
  %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2)
  %r = add i16 %rdx.1, %rdx.2
  ret i16 %r
}

define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-SD-LABEL: addv_combine_i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addv_combine_i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
entry:
  %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1)
  %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2)
  %r = add i32 %rdx.1, %rdx.2
  ret i32 %r
}

define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-SD-LABEL: addv_combine_i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addv_combine_i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    addp d1, v1.2d
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    add x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1)
  %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2)
  %r = add i64 %rdx.1, %rdx.2
  ret i64 %r
}

define i8 @addv_v2i8(<2 x i8> %a) {
; CHECK-LABEL: addv_v2i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a)
  ret i8 %arg1
}

define i8 @addv_v3i8(<3 x i8> %a) {
; CHECK-LABEL: addv_v3i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v0.h[0], w0
; CHECK-NEXT:    mov v0.h[1], w1
; CHECK-NEXT:    mov v0.h[2], w2
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a)
  ret i8 %arg1
}

define i8 @addv_v4i8(<4 x i8> %a) {
; CHECK-LABEL: addv_v4i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a)
  ret i8 %arg1
}

define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %arg1
}

define i8 @addv_v16i8(<16 x i8> %a) {
; CHECK-LABEL: addv_v16i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %arg1
}

define i8 @addv_v32i8(<32 x i8> %a) {
; CHECK-LABEL: addv_v32i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
  ret i8 %arg1
}

define i16 @addv_v2i16(<2 x i16> %a) {
; CHECK-LABEL: addv_v2i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a)
  ret i16 %arg1
}

define i16 @addv_v3i16(<3 x i16> %a) {
; CHECK-LABEL: addv_v3i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.h[3], wzr
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v3i16(<3 x i16> %a)
  ret i16 %arg1
}

define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %arg1
}

define i16 @addv_v8i16(<8 x i16> %a) {
; CHECK-LABEL: addv_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %arg1
}

define i16 @addv_v16i16(<16 x i16> %a) {
; CHECK-LABEL: addv_v16i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
  ret i16 %arg1
}

define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %arg1
}

define i32 @addv_v3i32(<3 x i32> %a) {
; CHECK-LABEL: addv_v3i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov v0.s[3], wzr
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a)
  ret i32 %arg1
}

define i32 @addv_v4i32(<4 x i32> %a) {
; CHECK-LABEL: addv_v4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %arg1
}

define i32 @addv_v8i32(<8 x i32> %a) {
; CHECK-LABEL: addv_v8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
  ret i32 %arg1
}

define i64 @addv_v2i64(<2 x i64> %a) {
; CHECK-LABEL: addv_v2i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %arg1
}

define i64 @addv_v3i64(<3 x i64> %a) {
; CHECK-LABEL: addv_v3i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    mov v2.d[1], xzr
; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i64 @llvm.vector.reduce.add.v3i64(<3 x i64> %a)
  ret i64 %arg1
}

define i64 @addv_v4i64(<4 x i64> %a) {
; CHECK-LABEL: addv_v4i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
  ret i64 %arg1
}

define i128 @addv_v2i128(<2 x i128> %a) {
; CHECK-LABEL: addv_v2i128:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    adds x0, x0, x2
; CHECK-NEXT:    adc x1, x1, x3
; CHECK-NEXT:    ret
entry:
  %arg1 = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
  ret i128 %arg1
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GISEL: {{.*}}
; SDAG: {{.*}}