; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s
; RUN: llc < %s -mtriple=arm64-eabi -global-isel | FileCheck %s

define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    smov w0, v0.b[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_s8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    smov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_s16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_s32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
; 2 x i32 is not supported by the ISA, thus, this is a special case
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_s32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

define i64 @test_vaddv_s64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_s64_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d1, v1.2d
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}
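; Unsigned (uaddv) variants. Unlike the signed tests above, no sign-extending
; smov is expected: ADDV writes a SIMD scalar register and zeroes the
; remaining bits of the vector register, so a plain fmov suffices. The
; "masked" tests below likewise expect an AND of the already zero-extended
; result to be folded away.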
define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_u8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8_masked:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = and i32 %vaddv.i, 511 ; 0x1ff
  ret i32 %0
}

define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_u16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16_masked:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
  ret i32 %0
}

define i32 @test_vaddv_u32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
; 2 x i32 is not supported by the ISA, thus, this is a special case
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_u32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}
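; Floating-point reductions. AdvSIMD has no single-instruction FADDV, so the
; expected lowering pairwise-adds with FADDP: one step for two lanes, two
; steps for four lanes.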
define float @test_vaddv_f32(<2 x float> %a1) {
; CHECK-LABEL: test_vaddv_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
  ret float %vaddv.i
}

define float @test_vaddv_v4f32(<4 x float> %a1) {
; CHECK-LABEL: test_vaddv_v4f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
  ret float %vaddv.i
}

define double @test_vaddv_f64(<2 x double> %a1) {
; CHECK-LABEL: test_vaddv_f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
  ret double %vaddv.i
}

define i64 @test_vaddv_u64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_u64_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d1, v1.2d
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64_to_vec:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
  ret <1 x i64> %vec
}
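; 128-bit (vaddvq) variants. The input already occupies a full Q register, so,
; unlike the 64-bit tests above, no kill markers for widening a D register are
; expected around the lane inserts.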
define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    smov w0, v0.b[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_s8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.16b
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    smov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_s16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.8h
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_s32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_s32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    mov v0.s[3], v1.s[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_u8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.16b
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_u16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.8h
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_u32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_u32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    mov v0.s[3], v1.s[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}
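; Declarations of the NEON reduction intrinsics exercised above.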
declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)

declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)

declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)

declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)