; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

define arm_aapcs_vfpcc i32 @test1(ptr %ptr, i32 %arg1, <4 x i32> %arg2, <4 x i32> %arg3) {
; CHECK-LABEL: test1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    adds r0, r2, r1
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg2)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %add1 = add i32 %reduce1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %arg1
  ret i32 %add2
}

define arm_aapcs_vfpcc i32 @test2(ptr %ptr, i32 %arg1, <4 x i32> %arg2, <4 x i32> %arg3) {
; CHECK-LABEL: test2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    adds r0, r1, r2
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg2)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %add1 = add i32 %reduce1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %arg1, %add1
  ret i32 %add2
}

define arm_aapcs_vfpcc i32 @test3(ptr %ptr, i32 %arg1, i32 %arg2, <4 x i32> %arg3, <4 x i32> %arg4) {
; CHECK-LABEL: test3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    add.w r0, r12, r2
; CHECK-NEXT:    bx lr
entry:
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg3)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %arg4)
  %add1 = add i32 %arg1, %reduce1
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %arg2, %reduce2
  %add3 = add i32 %add1, %add2
  ret i32 %add3
}

define arm_aapcs_vfpcc i32 @test4(ptr %ptr, i32 %arg1, ptr %arg2) {
; CHECK-LABEL: test4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #4]
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    bx lr
entry:
  %load1 = load <4 x i32>, ptr %arg2, align 4
  %gep = getelementptr inbounds i32, ptr %arg2, i32 1
  %load2 = load <4 x i32>, ptr %gep, align 4
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load1)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load2)
  %add1 = add i32 %arg1, %reduce1
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %reduce2
  ret i32 %add2
}

define arm_aapcs_vfpcc i32 @test5(ptr %ptr, i32 %arg1, ptr %arg2) {
; CHECK-LABEL: test5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2, #4]
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    vaddva.u32 r12, q0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    bx lr
entry:
  %load1 = load <4 x i32>, ptr %arg2, align 4
  %gep = getelementptr inbounds i32, ptr %arg2, i32 1
  %load2 = load <4 x i32>, ptr %gep, align 4
  %reduce1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load1)
  %reduce2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %load2)
  %add1 = add i32 %arg1, %reduce2
  store i32 %add1, ptr %ptr, align 4
  %add2 = add i32 %add1, %reduce1
  ret i32 %add2
}

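; A shufflevector that permutes the lanes of a single input does not change
; the value of an add reduction, so the shuffle in vaddv_shuffle_v16i8 should
; fold away into a plain vaddv.u8. The _duplicate and _undef variants use
; masks with a repeated lane and an undef lane respectively; there the
; shuffle is currently kept and expanded with element moves.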
define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_duplicate(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8_duplicate:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    vmov.8 q1[0], r0
; CHECK-NEXT:    vmov.8 q1[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r1
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 1, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

define arm_aapcs_vfpcc i16 @vaddv_shuffle_v16i8_undef(<16 x i8> %s0) {
; CHECK-LABEL: vaddv_shuffle_v16i8_undef:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 undef, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1 = zext <16 x i8> %s2 to <16 x i16>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

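; The same shuffle fold applies to extending (long) reductions: reversing the
; v4i32 input does not change the i64 sum, so these should lower directly to
; vaddlv.u32 / vaddlva.u32.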
define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long(<4 x i32> %s0) {
; CHECK-LABEL: vaddv_shuffle_v4i32_long:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1 = zext <4 x i32> %s2 to <4 x i64>
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @vaddv_shuffle_v4i32_long_a(<4 x i32> %s0, i64 %a) {
; CHECK-LABEL: vaddv_shuffle_v4i32_long_a:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %s2 = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1 = zext <4 x i32> %s2 to <4 x i64>
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  %r2 = add i64 %r, %a
  ret i64 %r2
}

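; For multiply-accumulate reductions the shuffles can only be dropped when
; both operands are permuted by the same single-source mask, as in
; vmla_shuffle_v16i8. In vmla_shuffle_v16i8_unequal the two masks differ in
; their final lanes, so both shuffles are materialised before the vmlav.s8.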
define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8(<16 x i8> %s0, <16 x i8> %s0b) {
; CHECK-LABEL: vmla_shuffle_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1a = sext <16 x i8> %s2a to <16 x i16>
  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1b = sext <16 x i8> %s2b to <16 x i16>
  %s1 = mul <16 x i16> %s1a, %s1b
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

define arm_aapcs_vfpcc i16 @vmla_shuffle_v16i8_unequal(<16 x i8> %s0, <16 x i8> %s0b) {
; CHECK-LABEL: vmla_shuffle_v16i8_unequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.8 q2[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.8 q2[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.8 q2[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.8 q2[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.8 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.8 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.8 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.8 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.8 q2[8], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.8 q2[9], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.8 q2[10], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.8 q2[11], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.8 q2[12], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.8 q2[13], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.8 q2[14], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.8 q2[15], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.8 q1[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.8 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.8 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.8 q1[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.8 q1[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.8 q1[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.8 q1[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.8 q1[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.8 q1[8], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.8 q1[9], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.8 q1[10], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.8 q1[11], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.8 q1[12], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.8 q1[13], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.8 q1[14], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.8 q1[15], r0
; CHECK-NEXT:    vmlav.s8 r0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <16 x i8> %s0, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %s1a = sext <16 x i8> %s2a to <16 x i16>
  %s2b = shufflevector <16 x i8> %s0b, <16 x i8> %s0, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 14>
  %s1b = sext <16 x i8> %s2b to <16 x i16>
  %s1 = mul <16 x i16> %s1a, %s1b
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s1)
  ret i16 %result
}

define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long(<4 x i32> %s0, <4 x i32> %s0b) {
; CHECK-LABEL: vmla_shuffle_v4i32_long:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1a = zext <4 x i32> %s2a to <4 x i64>
  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1b = zext <4 x i32> %s2b to <4 x i64>
  %s1 = mul <4 x i64> %s1a, %s1b
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @vmla_shuffle_v4i32_long_a(<4 x i32> %s0, <4 x i32> %s0b, i64 %a) {
; CHECK-LABEL: vmla_shuffle_v4i32_long_a:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s2a = shufflevector <4 x i32> %s0, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1a = zext <4 x i32> %s2a to <4 x i64>
  %s2b = shufflevector <4 x i32> %s0b, <4 x i32> %s0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %s1b = zext <4 x i32> %s2b to <4 x i64>
  %s1 = mul <4 x i64> %s1a, %s1b
  %r = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s1)
  %r2 = add i64 %r, %a
  ret i64 %r2
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)