; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Every test in this file multiplies %x by %y element-wise, keeps only the
; lanes where %b compares equal to zero (the other lanes contribute 0) and
; add-reduces the result to a scalar. The assertions record the Thumb/MVE code
; that llc emits for each element width and sign/zero-extension variant.

; i32 lanes, no extension, i32 result: one VPT block with vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %m = mul <4 x i32> %x, %y
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; i32 lanes zero-extended to i64: one VPT block with the long form vmlalvt.u32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; i32 lanes sign-extended to i64: one VPT block with vmlalvt.s32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i32> %b, zeroinitializer
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; Two i32 lanes zero-extended to i64: vmullb.u32 forms the products, the
; predicate is assembled in r1 (cmp/csetm/bfi) and moved to p0, and the two
; lanes chosen by vpsel are summed with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vmullb.u32 q3, q0, q1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #0, #8
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #8, #8
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpsel q0, q3, q0
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Two i32 lanes sign-extended to i64: same shape as the zero-extended variant,
; with vmullb.s32 forming the products.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vmullb.s32 q3, q0, q1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #0, #8
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #8, #8
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpsel q0, q3, q0
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i32> %b, zeroinitializer
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; i16 lanes zero-extended to i32: one VPT block with vmlavt.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; i16 lanes sign-extended to i32: one VPT block with vmlavt.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Four i16 lanes zero-extended to i32: q0, q1 and q2 are widened with
; vmovlb.u16 before a predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; Four i16 lanes sign-extended to i32: q0 and q1 are widened with vmovlb.s16,
; q2 with vmovlb.u16, then a predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; i16 lanes, no extension, zeroext i16 result: predicated vmlavt.u16 followed
; by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %m = mul <8 x i16> %x, %y
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; i16 lanes zero-extended to i64: one VPT block with vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; i16 lanes sign-extended to i64: one VPT block with vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Mixed operand widths (i16 %x, i8 %y) zero-extended to i64: q1 is widened with
; vmovlb.u8 before a predicated vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Mixed operand widths (i16 %x, i8 %y) sign-extended to i64: q1 is widened with
; vmovlb.s8 before a predicated vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; i16 lanes zero-extended to i32, multiplied, then the product zero-extended to
; i64: still a single predicated vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; i16 lanes sign-extended to i32, multiplied, then the product sign-extended to
; i64: a single predicated vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = sext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Square of the sign-extended %x (as i32) with the product zero-extended to
; i64: predicated vmlalvt.s16 using q0 for both multiplicands.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i16> %b, zeroinitializer
  %xx = sext <8 x i16> %x to <8 x i32>
  %m = mul <8 x i32> %xx, %xx
  %ma = zext <8 x i32> %m to <8 x i64>
  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
  ret i64 %z
}

; Four i16 lanes zero-extended to i64: vmovlb.u16 widening of q0, q1 and q2,
; then a predicated vmlalvt.u32.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = zext <4 x i16> %x to <4 x i64>
  %yy = zext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; Four i16 lanes sign-extended to i64: q0 and q1 are widened with vmovlb.s16,
; q2 with vmovlb.u16, then a predicated vmlalvt.s32.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q2, q2
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i16> %b, zeroinitializer
  %xx = sext <4 x i16> %x to <4 x i64>
  %yy = sext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
  ret i64 %z
}

; Two i16 lanes zero-extended to i64: the inputs are masked with 0xffff,
; multiplied with scalar umull, selected with vpsel and summed with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q3, #0xffff
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q0, q0, q3
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vand q1, q2, q3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #0, #8
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #8, #8
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; Two i16 lanes sign-extended to i64: the lanes are sign-extended with sxth and
; multiplied with scalar smull before the vpsel and the adds/adcs sum.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xffff
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #0, #8
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    cmp r0, #0
; CHECK-NEXT:    csetm r0, eq
; CHECK-NEXT:    bfi r1, r0, #8, #8
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    sxth r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
; CHECK-NEXT:    vpsel q0, q0, q1
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <2 x i16> %b, zeroinitializer
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ret i64 %z
}

; i8 lanes zero-extended to i32: one VPT block with vmlavt.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; i8 lanes sign-extended to i32: one VPT block with vmlavt.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; i8 lanes zero-extended to i16, multiplied, then the product zero-extended to
; i32: a single predicated vmlavt.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = zext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; i8 lanes sign-extended to i16, multiplied, then the product sign-extended to
; i32: a single predicated vmlavt.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = sext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Square of the sign-extended %x (as i16) with the product zero-extended to
; i32: predicated vmlavt.s8 using q0 for both multiplicands.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %m = mul <16 x i16> %xx, %xx
  %ma = zext <16 x i16> %m to <16 x i32>
  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
  ret i32 %z
}

; Eight i8 lanes zero-extended to i32: vmovlb.u8 widening of q0, q1 and q2,
; then a predicated vmlavt.u16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = zext <8 x i8> %x to <8 x i32>
  %yy = zext <8 x i8> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Eight i8 lanes sign-extended to i32: q0 and q1 are widened with vmovlb.s8,
; q2 with vmovlb.u8, then a predicated vmlavt.s16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = sext <8 x i8> %x to <8 x i32>
  %yy = sext <8 x i8> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Mixed operand widths (i8 %x, i16 %y) zero-extended to i32: q0 and q2 are
; widened with vmovlb.u8, then a predicated vmlavt.u16.
define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = zext <8 x i8> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Mixed operand widths (i8 %x, i16 %y) sign-extended to i32: q0 is widened with
; vmovlb.s8 and q2 with vmovlb.u8, then a predicated vmlavt.s16.
define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = sext <8 x i8> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Four i8 lanes zero-extended to i32: q0, q1 and q2 are masked with 0xff, then
; a predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vand q0, q0, q3
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i8> %b, zeroinitializer
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; Four i8 lanes sign-extended to i32: q0 and q1 are widened in two steps
; (vmovlb.s8, then vmovlb.s16), q2 is masked with 0xff, then a predicated
; vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i8> %b, zeroinitializer
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; %x sign-extended and %y zero-extended from i8 to i32: q0 is widened with
; vmovlb.s8/vmovlb.s16, q1 and q2 are masked with 0xff, then a predicated
; vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_szext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q3, #0xff
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vand q2, q2, q3
; CHECK-NEXT:    vand q1, q1, q3
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpt.i32 eq, q2, zr
; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <4 x i8> %b, zeroinitializer
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
  ret i32 %z
}

; i8 lanes zero-extended to i16, zeroext i16 result: predicated vmlavt.u8
; followed by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %z
}

; i8 lanes sign-extended to i16, signext i16 result: predicated vmlavt.s8
; followed by sxth.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %z
}

; %x sign-extended and %y zero-extended from i8 to i16: the recorded output is
; not a single reduction. Both inputs are stored to the stack and reloaded as
; widened halves (vldrb.s16 / vldrb.u16), the i8 predicate is rebuilt lane by
; lane as two i16 predicates, and two predicated reductions (vmlavt.u16, then
; the accumulating vmlavat.u16) feed the final sxth.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_szext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vcmp.i8 eq, q2, zr
; CHECK-NEXT:    vmov.i8 q0, #0x0
; CHECK-NEXT:    vmov.i8 q1, #0xff
; CHECK-NEXT:    vldrb.s16 q2, [r1, #8]
; CHECK-NEXT:    vpsel q0, q1, q0
; CHECK-NEXT:    vmov.u8 r2, q0[8]
; CHECK-NEXT:    vmov.u8 r3, q0[0]
; CHECK-NEXT:    vmov.16 q1[0], r2
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    vmov.16 q1[1], r2
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    vmov.16 q1[2], r2
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    vmov.16 q1[3], r2
; CHECK-NEXT:    vmov.u8 r2, q0[12]
; CHECK-NEXT:    vmov.16 q1[4], r2
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    vmov.16 q1[5], r2
; CHECK-NEXT:    vmov.u8 r2, q0[14]
; CHECK-NEXT:    vmov.16 q1[6], r2
; CHECK-NEXT:    vmov.u8 r2, q0[15]
; CHECK-NEXT:    vmov.16 q1[7], r2
; CHECK-NEXT:    vcmp.i16 ne, q1, zr
; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlavt.u16 r2, q2, q1
; CHECK-NEXT:    vmov.16 q1[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[1]
; CHECK-NEXT:    vmov.16 q1[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[2]
; CHECK-NEXT:    vmov.16 q1[2], r3
; CHECK-NEXT:    vmov.u8 r3, q0[3]
; CHECK-NEXT:    vmov.16 q1[3], r3
; CHECK-NEXT:    vmov.u8 r3, q0[4]
; CHECK-NEXT:    vmov.16 q1[4], r3
; CHECK-NEXT:    vmov.u8 r3, q0[5]
; CHECK-NEXT:    vmov.16 q1[5], r3
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    vmov.16 q1[6], r3
; CHECK-NEXT:    vmov.u8 r3, q0[7]
; CHECK-NEXT:    vmov.16 q1[7], r3
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vcmp.i16 ne, q1, zr
; CHECK-NEXT:    vldrb.s16 q1, [r1]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlavat.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %z
}

; Eight i8 lanes zero-extended to i16, zeroext i16 result: vmovlb.u8 widening
; of q0, q1 and q2, a predicated vmlavt.u16, then uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; Eight i8 lanes sign-extended to i16, signext i16 result: q0 and q1 are
; widened with vmovlb.s8, q2 with vmovlb.u8, then a predicated vmlavt.u16 and
; sxth.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vpt.i16 eq, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <8 x i8> %b, zeroinitializer
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
  ret i16 %z
}

; i8 lanes, no extension, zeroext i8 result: predicated vmlavt.u8 followed by
; uxtb.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vpt.i8 eq, q2, zr
; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %c = icmp eq <16 x i8> %b, zeroinitializer
  %m = mul <16 x i8> %x, %y
  %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
  ret i8 %z
}

813define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { 814; CHECK-LABEL: add_v16i8_v16i64_zext: 815; CHECK: @ %bb.0: @ %entry 816; CHECK-NEXT: .save {r7, lr} 817; CHECK-NEXT: push {r7, lr} 818; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 819; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 820; CHECK-NEXT: .pad #32 821; CHECK-NEXT: sub sp, #32 822; CHECK-NEXT: vmov q3, q0 823; CHECK-NEXT: vmov.i8 q0, #0x0 824; CHECK-NEXT: vcmp.i8 eq, q2, zr 825; CHECK-NEXT: vmov.i8 q2, #0xff 826; CHECK-NEXT: 
vpsel q6, q2, q0 827; CHECK-NEXT: vmov q4, q0 828; CHECK-NEXT: vmov.u8 r0, q6[0] 829; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 830; CHECK-NEXT: vmov.16 q0[0], r0 831; CHECK-NEXT: vmov.u8 r0, q6[1] 832; CHECK-NEXT: vmov.16 q0[1], r0 833; CHECK-NEXT: vmov.u8 r0, q6[2] 834; CHECK-NEXT: vmov.16 q0[2], r0 835; CHECK-NEXT: vmov.u8 r0, q6[3] 836; CHECK-NEXT: vmov.16 q0[3], r0 837; CHECK-NEXT: vmov.u8 r0, q6[4] 838; CHECK-NEXT: vmov.16 q0[4], r0 839; CHECK-NEXT: vmov.u8 r0, q6[5] 840; CHECK-NEXT: vmov.16 q0[5], r0 841; CHECK-NEXT: vmov.u8 r0, q6[6] 842; CHECK-NEXT: vmov.16 q0[6], r0 843; CHECK-NEXT: vmov.u8 r0, q6[7] 844; CHECK-NEXT: vmov.16 q0[7], r0 845; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill 846; CHECK-NEXT: vcmp.i16 ne, q0, zr 847; CHECK-NEXT: vmov.u8 r2, q3[0] 848; CHECK-NEXT: vpsel q7, q2, q4 849; CHECK-NEXT: vmov.u16 r0, q7[2] 850; CHECK-NEXT: vmov.u16 r1, q7[0] 851; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 852; CHECK-NEXT: vmov.u16 r0, q7[3] 853; CHECK-NEXT: vmov.u16 r1, q7[1] 854; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 855; CHECK-NEXT: vcmp.i32 ne, q0, zr 856; CHECK-NEXT: vpsel q0, q2, q4 857; CHECK-NEXT: vmov r0, r1, d0 858; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 859; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 860; CHECK-NEXT: vmov.u8 r0, q1[1] 861; CHECK-NEXT: vmov.u8 r1, q1[0] 862; CHECK-NEXT: vcmp.i32 ne, q2, zr 863; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 864; CHECK-NEXT: vmov.u8 r1, q3[1] 865; CHECK-NEXT: vmov.i64 q2, #0xff 866; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 867; CHECK-NEXT: vand q5, q5, q2 868; CHECK-NEXT: vand q4, q4, q2 869; CHECK-NEXT: vmov r0, s22 870; CHECK-NEXT: vmov r1, s18 871; CHECK-NEXT: vmov r2, s20 872; CHECK-NEXT: vmov.i32 q5, #0x0 873; CHECK-NEXT: vmov r3, s16 874; CHECK-NEXT: umull r0, r1, r1, r0 875; CHECK-NEXT: umull r2, r3, r3, r2 876; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 877; CHECK-NEXT: vmov q4[3], q4[1], r3, r1 878; CHECK-NEXT: vpsel q4, q4, q5 879; CHECK-NEXT: vmov r0, r1, d9 880; CHECK-NEXT: vmov r2, r3, d8 881; 
CHECK-NEXT: adds.w r12, r2, r0 882; CHECK-NEXT: vmov.u8 r0, q3[2] 883; CHECK-NEXT: adc.w lr, r3, r1 884; CHECK-NEXT: vmov r2, r3, d1 885; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 886; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 887; CHECK-NEXT: vmov.u8 r2, q1[3] 888; CHECK-NEXT: vmov.u8 r3, q1[2] 889; CHECK-NEXT: vcmp.i32 ne, q0, zr 890; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 891; CHECK-NEXT: vmov.u8 r3, q3[3] 892; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 893; CHECK-NEXT: vand q0, q0, q2 894; CHECK-NEXT: vand q4, q4, q2 895; CHECK-NEXT: vmov r2, s2 896; CHECK-NEXT: vmov r0, s18 897; CHECK-NEXT: vmov r1, s16 898; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 899; CHECK-NEXT: vmov r3, s0 900; CHECK-NEXT: umull r0, r2, r0, r2 901; CHECK-NEXT: umull r1, r3, r1, r3 902; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 903; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 904; CHECK-NEXT: vpsel q0, q0, q5 905; CHECK-NEXT: vmov r0, r1, d0 906; CHECK-NEXT: vmov r2, r3, d1 907; CHECK-NEXT: adds.w r0, r0, r12 908; CHECK-NEXT: adc.w r1, r1, lr 909; CHECK-NEXT: adds.w r12, r0, r2 910; CHECK-NEXT: adc.w lr, r1, r3 911; CHECK-NEXT: vmov.u16 r2, q7[6] 912; CHECK-NEXT: vmov.u16 r3, q7[4] 913; CHECK-NEXT: vmov.u8 r0, q3[4] 914; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 915; CHECK-NEXT: vmov.u16 r2, q7[7] 916; CHECK-NEXT: vmov.u16 r3, q7[5] 917; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 918; CHECK-NEXT: vcmp.i32 ne, q0, zr 919; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 920; CHECK-NEXT: vpsel q0, q0, q4 921; CHECK-NEXT: vmov r2, r3, d0 922; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 923; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 924; CHECK-NEXT: vmov.u8 r2, q1[5] 925; CHECK-NEXT: vmov.u8 r3, q1[4] 926; CHECK-NEXT: vcmp.i32 ne, q4, zr 927; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 928; CHECK-NEXT: vmov.u8 r3, q3[5] 929; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 930; CHECK-NEXT: vand q4, q4, q2 931; CHECK-NEXT: vand q7, q7, q2 932; CHECK-NEXT: vmov r2, s18 933; CHECK-NEXT: vmov r0, s30 934; CHECK-NEXT: vmov r1, s28 935; CHECK-NEXT: 
vldrw.u32 q7, [sp, #16] @ 16-byte Reload 936; CHECK-NEXT: vmov r3, s16 937; CHECK-NEXT: umull r0, r2, r0, r2 938; CHECK-NEXT: umull r1, r3, r1, r3 939; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 940; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 941; CHECK-NEXT: vpsel q4, q4, q5 942; CHECK-NEXT: vmov r0, r1, d8 943; CHECK-NEXT: vmov r2, r3, d9 944; CHECK-NEXT: adds.w r0, r0, r12 945; CHECK-NEXT: adc.w r1, r1, lr 946; CHECK-NEXT: adds.w r12, r0, r2 947; CHECK-NEXT: adc.w lr, r1, r3 948; CHECK-NEXT: vmov r2, r3, d1 949; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 950; CHECK-NEXT: vmov.u8 r0, q3[6] 951; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 952; CHECK-NEXT: vmov.u8 r2, q1[7] 953; CHECK-NEXT: vmov.u8 r3, q1[6] 954; CHECK-NEXT: vcmp.i32 ne, q0, zr 955; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 956; CHECK-NEXT: vmov.u8 r3, q3[7] 957; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 958; CHECK-NEXT: vand q0, q0, q2 959; CHECK-NEXT: vand q4, q4, q2 960; CHECK-NEXT: vmov r2, s2 961; CHECK-NEXT: vmov r0, s18 962; CHECK-NEXT: vmov r1, s16 963; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 964; CHECK-NEXT: vmov r3, s0 965; CHECK-NEXT: umull r0, r2, r0, r2 966; CHECK-NEXT: umull r1, r3, r1, r3 967; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 968; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 969; CHECK-NEXT: vpsel q0, q0, q5 970; CHECK-NEXT: vmov r0, r1, d0 971; CHECK-NEXT: vmov r2, r3, d1 972; CHECK-NEXT: adds.w r0, r0, r12 973; CHECK-NEXT: adc.w r1, r1, lr 974; CHECK-NEXT: adds.w r12, r0, r2 975; CHECK-NEXT: vmov.u8 r2, q6[8] 976; CHECK-NEXT: adc.w lr, r1, r3 977; CHECK-NEXT: vmov.16 q0[0], r2 978; CHECK-NEXT: vmov.u8 r2, q6[9] 979; CHECK-NEXT: vmov.16 q0[1], r2 980; CHECK-NEXT: vmov.u8 r2, q6[10] 981; CHECK-NEXT: vmov.16 q0[2], r2 982; CHECK-NEXT: vmov.u8 r2, q6[11] 983; CHECK-NEXT: vmov.16 q0[3], r2 984; CHECK-NEXT: vmov.u8 r2, q6[12] 985; CHECK-NEXT: vmov.16 q0[4], r2 986; CHECK-NEXT: vmov.u8 r2, q6[13] 987; CHECK-NEXT: vmov.16 q0[5], r2 988; CHECK-NEXT: vmov.u8 r2, q6[14] 989; CHECK-NEXT: vmov.16 q0[6], r2 990; 
CHECK-NEXT: vmov.u8 r2, q6[15] 991; CHECK-NEXT: vmov.16 q0[7], r2 992; CHECK-NEXT: vmov.u8 r0, q3[8] 993; CHECK-NEXT: vcmp.i16 ne, q0, zr 994; CHECK-NEXT: vpsel q6, q7, q4 995; CHECK-NEXT: vmov.u16 r2, q6[2] 996; CHECK-NEXT: vmov.u16 r3, q6[0] 997; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 998; CHECK-NEXT: vmov.u16 r2, q6[3] 999; CHECK-NEXT: vmov.u16 r3, q6[1] 1000; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1001; CHECK-NEXT: vcmp.i32 ne, q0, zr 1002; CHECK-NEXT: vpsel q0, q7, q4 1003; CHECK-NEXT: vmov r2, r3, d0 1004; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 1005; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 1006; CHECK-NEXT: vmov.u8 r2, q1[9] 1007; CHECK-NEXT: vmov.u8 r3, q1[8] 1008; CHECK-NEXT: vcmp.i32 ne, q4, zr 1009; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 1010; CHECK-NEXT: vmov.u8 r3, q3[9] 1011; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 1012; CHECK-NEXT: vand q4, q4, q2 1013; CHECK-NEXT: vand q7, q7, q2 1014; CHECK-NEXT: vmov r2, s18 1015; CHECK-NEXT: vmov r0, s30 1016; CHECK-NEXT: vmov r3, s16 1017; CHECK-NEXT: vmov r1, s28 1018; CHECK-NEXT: umull r0, r2, r0, r2 1019; CHECK-NEXT: umull r1, r3, r1, r3 1020; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 1021; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 1022; CHECK-NEXT: vpsel q4, q4, q5 1023; CHECK-NEXT: vmov r0, r1, d8 1024; CHECK-NEXT: vmov r2, r3, d9 1025; CHECK-NEXT: adds.w r0, r0, r12 1026; CHECK-NEXT: adc.w r1, r1, lr 1027; CHECK-NEXT: adds.w r12, r0, r2 1028; CHECK-NEXT: adc.w lr, r1, r3 1029; CHECK-NEXT: vmov r2, r3, d1 1030; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1031; CHECK-NEXT: vmov.u8 r0, q3[10] 1032; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1033; CHECK-NEXT: vmov.u8 r2, q1[11] 1034; CHECK-NEXT: vmov.u8 r3, q1[10] 1035; CHECK-NEXT: vcmp.i32 ne, q0, zr 1036; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1037; CHECK-NEXT: vmov.u8 r3, q3[11] 1038; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 1039; CHECK-NEXT: vand q0, q0, q2 1040; CHECK-NEXT: vand q4, q4, q2 1041; CHECK-NEXT: vmov r2, s2 1042; CHECK-NEXT: vmov r0, s18 1043; CHECK-NEXT: vmov r1, s16 1044; 
CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 1045; CHECK-NEXT: vmov r3, s0 1046; CHECK-NEXT: umull r0, r2, r0, r2 1047; CHECK-NEXT: umull r1, r3, r1, r3 1048; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 1049; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1050; CHECK-NEXT: vpsel q0, q0, q5 1051; CHECK-NEXT: vmov r0, r1, d0 1052; CHECK-NEXT: vmov r2, r3, d1 1053; CHECK-NEXT: adds.w r0, r0, r12 1054; CHECK-NEXT: adc.w r1, r1, lr 1055; CHECK-NEXT: adds.w r12, r0, r2 1056; CHECK-NEXT: adc.w lr, r1, r3 1057; CHECK-NEXT: vmov.u16 r2, q6[6] 1058; CHECK-NEXT: vmov.u16 r3, q6[4] 1059; CHECK-NEXT: vmov.u8 r0, q3[12] 1060; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1061; CHECK-NEXT: vmov.u16 r2, q6[7] 1062; CHECK-NEXT: vmov.u16 r3, q6[5] 1063; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1064; CHECK-NEXT: vcmp.i32 ne, q0, zr 1065; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 1066; CHECK-NEXT: vpsel q0, q0, q4 1067; CHECK-NEXT: vmov r2, r3, d0 1068; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 1069; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 1070; CHECK-NEXT: vmov.u8 r2, q1[13] 1071; CHECK-NEXT: vmov.u8 r3, q1[12] 1072; CHECK-NEXT: vcmp.i32 ne, q4, zr 1073; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 1074; CHECK-NEXT: vmov.u8 r3, q3[13] 1075; CHECK-NEXT: vmov q6[2], q6[0], r0, r3 1076; CHECK-NEXT: vand q4, q4, q2 1077; CHECK-NEXT: vand q6, q6, q2 1078; CHECK-NEXT: vmov r2, s18 1079; CHECK-NEXT: vmov r0, s26 1080; CHECK-NEXT: vmov r3, s16 1081; CHECK-NEXT: vmov r1, s24 1082; CHECK-NEXT: umull r0, r2, r0, r2 1083; CHECK-NEXT: umull r1, r3, r1, r3 1084; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 1085; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 1086; CHECK-NEXT: vpsel q4, q4, q5 1087; CHECK-NEXT: vmov r0, r1, d8 1088; CHECK-NEXT: vmov r2, r3, d9 1089; CHECK-NEXT: adds.w r0, r0, r12 1090; CHECK-NEXT: adc.w r1, r1, lr 1091; CHECK-NEXT: adds.w r12, r0, r2 1092; CHECK-NEXT: adc.w lr, r1, r3 1093; CHECK-NEXT: vmov r2, r3, d1 1094; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1095; CHECK-NEXT: vmov.u8 r0, q3[14] 1096; CHECK-NEXT: vmov 
q0[3], q0[1], r2, r3 1097; CHECK-NEXT: vmov.u8 r2, q1[15] 1098; CHECK-NEXT: vmov.u8 r3, q1[14] 1099; CHECK-NEXT: vcmp.i32 ne, q0, zr 1100; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1101; CHECK-NEXT: vmov.u8 r3, q3[15] 1102; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 1103; CHECK-NEXT: vand q0, q0, q2 1104; CHECK-NEXT: vand q1, q1, q2 1105; CHECK-NEXT: vmov r2, s2 1106; CHECK-NEXT: vmov r0, s6 1107; CHECK-NEXT: vmov r3, s0 1108; CHECK-NEXT: vmov r1, s4 1109; CHECK-NEXT: umull r0, r2, r0, r2 1110; CHECK-NEXT: umull r1, r3, r1, r3 1111; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 1112; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1113; CHECK-NEXT: vpsel q0, q0, q5 1114; CHECK-NEXT: vmov r0, r1, d0 1115; CHECK-NEXT: vmov r2, r3, d1 1116; CHECK-NEXT: adds.w r0, r0, r12 1117; CHECK-NEXT: adc.w r1, r1, lr 1118; CHECK-NEXT: adds r0, r0, r2 1119; CHECK-NEXT: adcs r1, r3 1120; CHECK-NEXT: add sp, #32 1121; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1122; CHECK-NEXT: pop {r7, pc} 1123entry: 1124 %c = icmp eq <16 x i8> %b, zeroinitializer 1125 %xx = zext <16 x i8> %x to <16 x i64> 1126 %yy = zext <16 x i8> %y to <16 x i64> 1127 %m = mul <16 x i64> %xx, %yy 1128 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer 1129 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) 1130 ret i64 %z 1131} 1132 ; add_v16i8_v16i64_sext: i64 reduce.add of sext(%x) * sext(%y) over the 16 lanes where %b == 0 (other lanes are selected to zero). Expected codegen is a long lane-pair expansion (smull, vpsel, adds/adc) instead of a single predicated reduction. 1133define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) { 1134; CHECK-LABEL: add_v16i8_v16i64_sext: 1135; CHECK: @ %bb.0: @ %entry 1136; CHECK-NEXT: .save {r7, lr} 1137; CHECK-NEXT: push {r7, lr} 1138; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1139; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1140; CHECK-NEXT: .pad #16 1141; CHECK-NEXT: sub sp, #16 1142; CHECK-NEXT: vmov q3, q0 1143; CHECK-NEXT: vcmp.i8 eq, q2, zr 1144; CHECK-NEXT: vmov.i8 q0, #0x0 1145; CHECK-NEXT: vmov.i8 q2, #0xff 1146; CHECK-NEXT: vpsel q5, q2, q0 1147; CHECK-NEXT: vmov.s8 r2, q1[0] 1148; CHECK-NEXT: vmov.u8 r0, q5[0] 1149; 
CHECK-NEXT: vmov.s8 r3, q3[0] 1150; CHECK-NEXT: vmov.16 q4[0], r0 1151; CHECK-NEXT: vmov.u8 r0, q5[1] 1152; CHECK-NEXT: vmov.16 q4[1], r0 1153; CHECK-NEXT: vmov.u8 r0, q5[2] 1154; CHECK-NEXT: vmov.16 q4[2], r0 1155; CHECK-NEXT: vmov.u8 r0, q5[3] 1156; CHECK-NEXT: vmov.16 q4[3], r0 1157; CHECK-NEXT: vmov.u8 r0, q5[4] 1158; CHECK-NEXT: vmov.16 q4[4], r0 1159; CHECK-NEXT: vmov.u8 r0, q5[5] 1160; CHECK-NEXT: vmov.16 q4[5], r0 1161; CHECK-NEXT: vmov.u8 r0, q5[6] 1162; CHECK-NEXT: vmov.16 q4[6], r0 1163; CHECK-NEXT: vmov.u8 r0, q5[7] 1164; CHECK-NEXT: vmov.16 q4[7], r0 1165; CHECK-NEXT: smull r2, r3, r3, r2 1166; CHECK-NEXT: vcmp.i16 ne, q4, zr 1167; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 1168; CHECK-NEXT: vpsel q6, q2, q0 1169; CHECK-NEXT: vmov.u16 r0, q6[2] 1170; CHECK-NEXT: vmov.u16 r1, q6[0] 1171; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 1172; CHECK-NEXT: vmov.u16 r0, q6[3] 1173; CHECK-NEXT: vmov.u16 r1, q6[1] 1174; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 1175; CHECK-NEXT: vcmp.i32 ne, q4, zr 1176; CHECK-NEXT: vpsel q7, q2, q0 1177; CHECK-NEXT: vmov r0, r1, d14 1178; CHECK-NEXT: vmov q4[2], q4[0], r0, r1 1179; CHECK-NEXT: vmov q4[3], q4[1], r0, r1 1180; CHECK-NEXT: vmov.s8 r0, q1[1] 1181; CHECK-NEXT: vmov.s8 r1, q3[1] 1182; CHECK-NEXT: vcmp.i32 ne, q4, zr 1183; CHECK-NEXT: smull r0, r1, r1, r0 1184; CHECK-NEXT: vmov.i32 q4, #0x0 1185; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 1186; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 1187; CHECK-NEXT: vpsel q0, q0, q4 1188; CHECK-NEXT: vmov r0, r1, d1 1189; CHECK-NEXT: vmov r2, r3, d0 1190; CHECK-NEXT: adds.w r12, r2, r0 1191; CHECK-NEXT: vmov.s8 r0, q1[2] 1192; CHECK-NEXT: adc.w lr, r3, r1 1193; CHECK-NEXT: vmov r2, r3, d15 1194; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1195; CHECK-NEXT: vmov.s8 r1, q3[2] 1196; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1197; CHECK-NEXT: vmov.s8 r2, q1[3] 1198; CHECK-NEXT: vmov.s8 r3, q3[3] 1199; CHECK-NEXT: smull r0, r1, r1, r0 1200; CHECK-NEXT: vcmp.i32 ne, q0, zr 1201; CHECK-NEXT: vldrw.u32 q7, [sp] @ 
16-byte Reload 1202; CHECK-NEXT: smull r2, r3, r3, r2 1203; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1204; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1205; CHECK-NEXT: vpsel q0, q0, q4 1206; CHECK-NEXT: vmov r0, r1, d0 1207; CHECK-NEXT: vmov r2, r3, d1 1208; CHECK-NEXT: adds.w r0, r0, r12 1209; CHECK-NEXT: adc.w r1, r1, lr 1210; CHECK-NEXT: adds.w r12, r0, r2 1211; CHECK-NEXT: adc.w lr, r1, r3 1212; CHECK-NEXT: vmov.u16 r2, q6[6] 1213; CHECK-NEXT: vmov.u16 r3, q6[4] 1214; CHECK-NEXT: vmov.s8 r0, q1[4] 1215; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1216; CHECK-NEXT: vmov.u16 r2, q6[7] 1217; CHECK-NEXT: vmov.u16 r3, q6[5] 1218; CHECK-NEXT: vmov.s8 r1, q3[4] 1219; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1220; CHECK-NEXT: smull r0, r1, r1, r0 1221; CHECK-NEXT: vcmp.i32 ne, q0, zr 1222; CHECK-NEXT: vpsel q6, q2, q7 1223; CHECK-NEXT: vmov r2, r3, d12 1224; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1225; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1226; CHECK-NEXT: vmov.s8 r2, q1[5] 1227; CHECK-NEXT: vmov.s8 r3, q3[5] 1228; CHECK-NEXT: vcmp.i32 ne, q0, zr 1229; CHECK-NEXT: smull r2, r3, r3, r2 1230; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1231; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1232; CHECK-NEXT: vpsel q0, q0, q4 1233; CHECK-NEXT: vmov r0, r1, d0 1234; CHECK-NEXT: vmov r2, r3, d1 1235; CHECK-NEXT: adds.w r0, r0, r12 1236; CHECK-NEXT: adc.w r1, r1, lr 1237; CHECK-NEXT: adds.w r12, r0, r2 1238; CHECK-NEXT: adc.w lr, r1, r3 1239; CHECK-NEXT: vmov r2, r3, d13 1240; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1241; CHECK-NEXT: vmov.s8 r0, q1[6] 1242; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1243; CHECK-NEXT: vmov.s8 r1, q3[6] 1244; CHECK-NEXT: vmov.s8 r2, q1[7] 1245; CHECK-NEXT: vmov.s8 r3, q3[7] 1246; CHECK-NEXT: smull r2, r3, r3, r2 1247; CHECK-NEXT: vcmp.i32 ne, q0, zr 1248; CHECK-NEXT: smull r0, r1, r1, r0 1249; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1250; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1251; CHECK-NEXT: vpsel q0, q0, q4 1252; CHECK-NEXT: vmov r0, r1, d0 1253; CHECK-NEXT: vmov r2, r3, d1 1254; 
CHECK-NEXT: adds.w r0, r0, r12 1255; CHECK-NEXT: adc.w r1, r1, lr 1256; CHECK-NEXT: adds.w r12, r0, r2 1257; CHECK-NEXT: vmov.u8 r2, q5[8] 1258; CHECK-NEXT: adc.w lr, r1, r3 1259; CHECK-NEXT: vmov.16 q6[0], r2 1260; CHECK-NEXT: vmov.u8 r2, q5[9] 1261; CHECK-NEXT: vmov.16 q6[1], r2 1262; CHECK-NEXT: vmov.u8 r2, q5[10] 1263; CHECK-NEXT: vmov.16 q6[2], r2 1264; CHECK-NEXT: vmov.u8 r2, q5[11] 1265; CHECK-NEXT: vmov.16 q6[3], r2 1266; CHECK-NEXT: vmov.u8 r2, q5[12] 1267; CHECK-NEXT: vmov.16 q6[4], r2 1268; CHECK-NEXT: vmov.u8 r2, q5[13] 1269; CHECK-NEXT: vmov.16 q6[5], r2 1270; CHECK-NEXT: vmov.u8 r2, q5[14] 1271; CHECK-NEXT: vmov.16 q6[6], r2 1272; CHECK-NEXT: vmov.u8 r2, q5[15] 1273; CHECK-NEXT: vmov.16 q6[7], r2 1274; CHECK-NEXT: vmov.s8 r0, q1[8] 1275; CHECK-NEXT: vcmp.i16 ne, q6, zr 1276; CHECK-NEXT: vmov.s8 r1, q3[8] 1277; CHECK-NEXT: vpsel q5, q2, q7 1278; CHECK-NEXT: smull r0, r1, r1, r0 1279; CHECK-NEXT: vmov.u16 r2, q5[2] 1280; CHECK-NEXT: vmov.u16 r3, q5[0] 1281; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1282; CHECK-NEXT: vmov.u16 r2, q5[3] 1283; CHECK-NEXT: vmov.u16 r3, q5[1] 1284; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1285; CHECK-NEXT: vcmp.i32 ne, q0, zr 1286; CHECK-NEXT: vpsel q6, q2, q7 1287; CHECK-NEXT: vmov r2, r3, d12 1288; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1289; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1290; CHECK-NEXT: vmov.s8 r2, q1[9] 1291; CHECK-NEXT: vmov.s8 r3, q3[9] 1292; CHECK-NEXT: vcmp.i32 ne, q0, zr 1293; CHECK-NEXT: smull r2, r3, r3, r2 1294; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1295; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1296; CHECK-NEXT: vpsel q0, q0, q4 1297; CHECK-NEXT: vmov r0, r1, d0 1298; CHECK-NEXT: vmov r2, r3, d1 1299; CHECK-NEXT: adds.w r0, r0, r12 1300; CHECK-NEXT: adc.w r1, r1, lr 1301; CHECK-NEXT: adds.w r12, r0, r2 1302; CHECK-NEXT: adc.w lr, r1, r3 1303; CHECK-NEXT: vmov r2, r3, d13 1304; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1305; CHECK-NEXT: vmov.s8 r0, q1[10] 1306; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1307; CHECK-NEXT: 
vmov.s8 r1, q3[10] 1308; CHECK-NEXT: vmov.s8 r2, q1[11] 1309; CHECK-NEXT: vmov.s8 r3, q3[11] 1310; CHECK-NEXT: smull r2, r3, r3, r2 1311; CHECK-NEXT: vcmp.i32 ne, q0, zr 1312; CHECK-NEXT: smull r0, r1, r1, r0 1313; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1314; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1315; CHECK-NEXT: vpsel q0, q0, q4 1316; CHECK-NEXT: vmov r0, r1, d0 1317; CHECK-NEXT: vmov r2, r3, d1 1318; CHECK-NEXT: adds.w r0, r0, r12 1319; CHECK-NEXT: adc.w r1, r1, lr 1320; CHECK-NEXT: adds.w r12, r0, r2 1321; CHECK-NEXT: adc.w lr, r1, r3 1322; CHECK-NEXT: vmov.u16 r2, q5[6] 1323; CHECK-NEXT: vmov.u16 r3, q5[4] 1324; CHECK-NEXT: vmov.s8 r0, q1[12] 1325; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 1326; CHECK-NEXT: vmov.u16 r2, q5[7] 1327; CHECK-NEXT: vmov.u16 r3, q5[5] 1328; CHECK-NEXT: vmov.s8 r1, q3[12] 1329; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 1330; CHECK-NEXT: smull r0, r1, r1, r0 1331; CHECK-NEXT: vcmp.i32 ne, q0, zr 1332; CHECK-NEXT: vpsel q2, q2, q7 1333; CHECK-NEXT: vmov r2, r3, d4 1334; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1335; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1336; CHECK-NEXT: vmov.s8 r2, q1[13] 1337; CHECK-NEXT: vmov.s8 r3, q3[13] 1338; CHECK-NEXT: vcmp.i32 ne, q0, zr 1339; CHECK-NEXT: smull r2, r3, r3, r2 1340; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 1341; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1342; CHECK-NEXT: vpsel q0, q0, q4 1343; CHECK-NEXT: vmov r0, r1, d0 1344; CHECK-NEXT: vmov r2, r3, d1 1345; CHECK-NEXT: adds.w r0, r0, r12 1346; CHECK-NEXT: adc.w r1, r1, lr 1347; CHECK-NEXT: adds.w r12, r0, r2 1348; CHECK-NEXT: adc.w lr, r1, r3 1349; CHECK-NEXT: vmov r2, r3, d5 1350; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 1351; CHECK-NEXT: vmov.s8 r0, q1[14] 1352; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 1353; CHECK-NEXT: vmov.s8 r1, q3[14] 1354; CHECK-NEXT: vmov.s8 r2, q1[15] 1355; CHECK-NEXT: vmov.s8 r3, q3[15] 1356; CHECK-NEXT: smull r2, r3, r3, r2 1357; CHECK-NEXT: vcmp.i32 ne, q0, zr 1358; CHECK-NEXT: smull r0, r1, r1, r0 1359; CHECK-NEXT: vmov q0[2], q0[0], 
r0, r2 1360; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 1361; CHECK-NEXT: vpsel q0, q0, q4 1362; CHECK-NEXT: vmov r0, r1, d0 1363; CHECK-NEXT: vmov r2, r3, d1 1364; CHECK-NEXT: adds.w r0, r0, r12 1365; CHECK-NEXT: adc.w r1, r1, lr 1366; CHECK-NEXT: adds r0, r0, r2 1367; CHECK-NEXT: adcs r1, r3 1368; CHECK-NEXT: add sp, #16 1369; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1370; CHECK-NEXT: pop {r7, pc} 1371entry: 1372 %c = icmp eq <16 x i8> %b, zeroinitializer 1373 %xx = sext <16 x i8> %x to <16 x i64> 1374 %yy = sext <16 x i8> %y to <16 x i64> 1375 %m = mul <16 x i64> %xx, %yy 1376 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer 1377 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) 1378 ret i64 %z 1379} 1380 ; add_v8i8_v8i64_zext: i64 reduce.add of zext(%x) * zext(%y) over the 8 lanes where %b == 0. Expected codegen widens all three inputs to i16 lanes with vmovlb.u8 and issues one predicated vmlalvt.u16. 1381define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) { 1382; CHECK-LABEL: add_v8i8_v8i64_zext: 1383; CHECK: @ %bb.0: @ %entry 1384; CHECK-NEXT: vmovlb.u8 q2, q2 1385; CHECK-NEXT: vmovlb.u8 q1, q1 1386; CHECK-NEXT: vmovlb.u8 q0, q0 1387; CHECK-NEXT: vpt.i16 eq, q2, zr 1388; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1 1389; CHECK-NEXT: bx lr 1390entry: 1391 %c = icmp eq <8 x i8> %b, zeroinitializer 1392 %xx = zext <8 x i8> %x to <8 x i64> 1393 %yy = zext <8 x i8> %y to <8 x i64> 1394 %m = mul <8 x i64> %xx, %yy 1395 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer 1396 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1397 ret i64 %z 1398} 1399 ; add_v8i8_v8i64_sext: signed counterpart. The operands are widened with vmovlb.s8 while the mask %b is still widened with vmovlb.u8 (it is only compared against zero); the reduction is one predicated vmlalvt.s16. 1400define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) { 1401; CHECK-LABEL: add_v8i8_v8i64_sext: 1402; CHECK: @ %bb.0: @ %entry 1403; CHECK-NEXT: vmovlb.u8 q2, q2 1404; CHECK-NEXT: vmovlb.s8 q1, q1 1405; CHECK-NEXT: vmovlb.s8 q0, q0 1406; CHECK-NEXT: vpt.i16 eq, q2, zr 1407; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1 1408; CHECK-NEXT: bx lr 1409entry: 1410 %c = icmp eq <8 x i8> %b, zeroinitializer 1411 %xx = sext <8 x i8> %x to <8 x i64> 1412 %yy = sext <8 x i8> %y to <8 x i64> 1413 %m = mul <8 x 
i64> %xx, %yy 1414 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer 1415 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1416 ret i64 %z 1417} 1418 ; add_v4i8_v4i64_zext: i64 reduce.add of zext(%x) * zext(%y) over the 4 lanes where %b == 0. Expected codegen clears the upper bits of each 32-bit lane with vand against #0xff and issues one predicated vmlalvt.u32. 1419define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) { 1420; CHECK-LABEL: add_v4i8_v4i64_zext: 1421; CHECK: @ %bb.0: @ %entry 1422; CHECK-NEXT: vmov.i32 q3, #0xff 1423; CHECK-NEXT: vand q2, q2, q3 1424; CHECK-NEXT: vand q1, q1, q3 1425; CHECK-NEXT: vand q0, q0, q3 1426; CHECK-NEXT: vpt.i32 eq, q2, zr 1427; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1 1428; CHECK-NEXT: bx lr 1429entry: 1430 %c = icmp eq <4 x i8> %b, zeroinitializer 1431 %xx = zext <4 x i8> %x to <4 x i64> 1432 %yy = zext <4 x i8> %y to <4 x i64> 1433 %m = mul <4 x i64> %xx, %yy 1434 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1435 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1436 ret i64 %z 1437} 1438 ; add_v4i8_v4i64_sext: signed counterpart. The operands are sign-extended in place (vmovlb.s8 then vmovlb.s16), the mask is still masked with vand against #0xff, and the reduction is one predicated vmlalvt.s32. 1439define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) { 1440; CHECK-LABEL: add_v4i8_v4i64_sext: 1441; CHECK: @ %bb.0: @ %entry 1442; CHECK-NEXT: vmov.i32 q3, #0xff 1443; CHECK-NEXT: vmovlb.s8 q1, q1 1444; CHECK-NEXT: vmovlb.s8 q0, q0 1445; CHECK-NEXT: vand q2, q2, q3 1446; CHECK-NEXT: vmovlb.s16 q1, q1 1447; CHECK-NEXT: vmovlb.s16 q0, q0 1448; CHECK-NEXT: vpt.i32 eq, q2, zr 1449; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 1450; CHECK-NEXT: bx lr 1451entry: 1452 %c = icmp eq <4 x i8> %b, zeroinitializer 1453 %xx = sext <4 x i8> %x to <4 x i64> 1454 %yy = sext <4 x i8> %y to <4 x i64> 1455 %m = mul <4 x i64> %xx, %yy 1456 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1457 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1458 ret i64 %z 1459} 1460 ; add_v4i8i16_v4i64_zext: mixed-width inputs (%x is 4 x i8, %y is 4 x i16), both zero-extended to i64 before the multiply. Expected codegen is one predicated vmlalvt.u32. 1461define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { 1462; CHECK-LABEL: add_v4i8i16_v4i64_zext: 1463; CHECK: @ %bb.0: @ %entry 1464; CHECK-NEXT: vmov.i32 q3, #0xff 1465; CHECK-NEXT: vmovlb.u16 q1, q1 1466; CHECK-NEXT: vand 
q2, q2, q3 1467; CHECK-NEXT: vand q0, q0, q3 1468; CHECK-NEXT: vpt.i32 eq, q2, zr 1469; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1 1470; CHECK-NEXT: bx lr 1471entry: 1472 %c = icmp eq <4 x i8> %b, zeroinitializer 1473 %xx = zext <4 x i8> %x to <4 x i64> 1474 %yy = zext <4 x i16> %y to <4 x i64> 1475 %m = mul <4 x i64> %xx, %yy 1476 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1477 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1478 ret i64 %z 1479} 1480 ; add_v4i8i16_v4i64_sext: mixed-width inputs (%x is 4 x i8, %y is 4 x i16), both sign-extended to i64 before the multiply. Expected codegen is one predicated vmlalvt.s32. 1481define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { 1482; CHECK-LABEL: add_v4i8i16_v4i64_sext: 1483; CHECK: @ %bb.0: @ %entry 1484; CHECK-NEXT: vmov.i32 q3, #0xff 1485; CHECK-NEXT: vmovlb.s8 q0, q0 1486; CHECK-NEXT: vand q2, q2, q3 1487; CHECK-NEXT: vmovlb.s16 q1, q1 1488; CHECK-NEXT: vmovlb.s16 q0, q0 1489; CHECK-NEXT: vpt.i32 eq, q2, zr 1490; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 1491; CHECK-NEXT: bx lr 1492entry: 1493 %c = icmp eq <4 x i8> %b, zeroinitializer 1494 %xx = sext <4 x i8> %x to <4 x i64> 1495 %yy = sext <4 x i16> %y to <4 x i64> 1496 %m = mul <4 x i64> %xx, %yy 1497 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1498 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1499 ret i64 %z 1500} 1501 ; add_v4i8i16_v4i32_v4i64_zext: the multiply is done at i32 (%mm) and only the product is zero-extended to i64. Expected codegen is still one predicated vmlalvt.u32. 1502define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { 1503; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext: 1504; CHECK: @ %bb.0: @ %entry 1505; CHECK-NEXT: vmov.i32 q3, #0xff 1506; CHECK-NEXT: vmovlb.u16 q1, q1 1507; CHECK-NEXT: vand q2, q2, q3 1508; CHECK-NEXT: vand q0, q0, q3 1509; CHECK-NEXT: vpt.i32 eq, q2, zr 1510; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1 1511; CHECK-NEXT: bx lr 1512entry: 1513 %c = icmp eq <4 x i8> %b, zeroinitializer 1514 %xx = zext <4 x i8> %x to <4 x i32> 1515 %yy = zext <4 x i16> %y to <4 x i32> 1516 %mm = mul <4 x i32> %xx, %yy 1517 %m = zext <4 x i32> %mm to <4 x i64> 1518 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1519 %z 
= call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1520 ret i64 %z 1521} 1522 ; add_v4i8i16_v4i32_v4i64_sext: the multiply is done at i32 (%mm) and only the product is sign-extended to i64. Expected codegen is one predicated vmlalvt.s32. 1523define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { 1524; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext: 1525; CHECK: @ %bb.0: @ %entry 1526; CHECK-NEXT: vmov.i32 q3, #0xff 1527; CHECK-NEXT: vmovlb.s8 q0, q0 1528; CHECK-NEXT: vand q2, q2, q3 1529; CHECK-NEXT: vmovlb.s16 q1, q1 1530; CHECK-NEXT: vmovlb.s16 q0, q0 1531; CHECK-NEXT: vpt.i32 eq, q2, zr 1532; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 1533; CHECK-NEXT: bx lr 1534entry: 1535 %c = icmp eq <4 x i8> %b, zeroinitializer 1536 %xx = sext <4 x i8> %x to <4 x i32> 1537 %yy = sext <4 x i16> %y to <4 x i32> 1538 %mm = mul <4 x i32> %xx, %yy 1539 %m = sext <4 x i32> %mm to <4 x i64> 1540 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1541 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1542 ret i64 %z 1543} 1544 ; add_v2i8_v2i64_zext: two-lane case. Expected codegen multiplies each lane with a scalar umull, builds the predicate by hand (cmp/csetm/bfi, then vmsr p0), applies it with vpsel and adds the two i64 lanes with adds/adcs. 1545define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { 1546; CHECK-LABEL: add_v2i8_v2i64_zext: 1547; CHECK: @ %bb.0: @ %entry 1548; CHECK-NEXT: vmov.i64 q3, #0xff 1549; CHECK-NEXT: vand q1, q1, q3 1550; CHECK-NEXT: vand q0, q0, q3 1551; CHECK-NEXT: vmov r0, s6 1552; CHECK-NEXT: vmov r1, s2 1553; CHECK-NEXT: vmov r2, s4 1554; CHECK-NEXT: vand q1, q2, q3 1555; CHECK-NEXT: vmov r3, s0 1556; CHECK-NEXT: umull r0, r1, r1, r0 1557; CHECK-NEXT: umull r2, r3, r3, r2 1558; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 1559; CHECK-NEXT: vmov r0, s4 1560; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 1561; CHECK-NEXT: movs r1, #0 1562; CHECK-NEXT: cmp r0, #0 1563; CHECK-NEXT: csetm r0, eq 1564; CHECK-NEXT: bfi r1, r0, #0, #8 1565; CHECK-NEXT: vmov r0, s6 1566; CHECK-NEXT: vmov.i32 q1, #0x0 1567; CHECK-NEXT: cmp r0, #0 1568; CHECK-NEXT: csetm r0, eq 1569; CHECK-NEXT: bfi r1, r0, #8, #8 1570; CHECK-NEXT: vmsr p0, r1 1571; CHECK-NEXT: vpsel q0, q0, q1 1572; CHECK-NEXT: vmov r0, r1, d1 1573; CHECK-NEXT: vmov r2, r3, d0 1574; CHECK-NEXT: adds r0, r0, 
r2 1575; CHECK-NEXT: adcs r1, r3 1576; CHECK-NEXT: bx lr 1577entry: 1578 %c = icmp eq <2 x i8> %b, zeroinitializer 1579 %xx = zext <2 x i8> %x to <2 x i64> 1580 %yy = zext <2 x i8> %y to <2 x i64> 1581 %m = mul <2 x i64> %xx, %yy 1582 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 1583 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 1584 ret i64 %z 1585} 1586 ; add_v2i8_v2i64_sext: signed two-lane case. Expected codegen sign-extends each lane with sxtb, multiplies with scalar smull, builds the predicate by hand (cmp/csetm/bfi, then vmsr p0) and selects with vpsel before the final adds/adcs. 1587define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { 1588; CHECK-LABEL: add_v2i8_v2i64_sext: 1589; CHECK: @ %bb.0: @ %entry 1590; CHECK-NEXT: vmov.i32 q3, #0xff 1591; CHECK-NEXT: movs r1, #0 1592; CHECK-NEXT: vand q2, q2, q3 1593; CHECK-NEXT: vmov r2, s4 1594; CHECK-NEXT: vmov r0, s8 1595; CHECK-NEXT: vmov r3, s0 1596; CHECK-NEXT: cmp r0, #0 1597; CHECK-NEXT: sxtb r2, r2 1598; CHECK-NEXT: csetm r0, eq 1599; CHECK-NEXT: bfi r1, r0, #0, #8 1600; CHECK-NEXT: vmov r0, s10 1601; CHECK-NEXT: sxtb r3, r3 1602; CHECK-NEXT: smull r2, r3, r3, r2 1603; CHECK-NEXT: cmp r0, #0 1604; CHECK-NEXT: csetm r0, eq 1605; CHECK-NEXT: bfi r1, r0, #8, #8 1606; CHECK-NEXT: vmov r0, s6 1607; CHECK-NEXT: vmsr p0, r1 1608; CHECK-NEXT: vmov r1, s2 1609; CHECK-NEXT: vmov.i32 q1, #0x0 1610; CHECK-NEXT: sxtb r0, r0 1611; CHECK-NEXT: sxtb r1, r1 1612; CHECK-NEXT: smull r0, r1, r1, r0 1613; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 1614; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 1615; CHECK-NEXT: vpsel q0, q0, q1 1616; CHECK-NEXT: vmov r0, r1, d1 1617; CHECK-NEXT: vmov r2, r3, d0 1618; CHECK-NEXT: adds r0, r0, r2 1619; CHECK-NEXT: adcs r1, r3 1620; CHECK-NEXT: bx lr 1621entry: 1622 %c = icmp eq <2 x i8> %b, zeroinitializer 1623 %xx = sext <2 x i8> %x to <2 x i64> 1624 %yy = sext <2 x i8> %y to <2 x i64> 1625 %m = mul <2 x i64> %xx, %yy 1626 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 1627 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 1628 ret i64 %z 1629} 1630 ; add_v2i64_v2i64: 2 x i64 operands multiplied without any extension. Expected codegen expands each 64x64 multiply into umull plus mla, tests each 64-bit mask lane with orrs, and selects with vpsel. 1631define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) { 
1632; CHECK-LABEL: add_v2i64_v2i64: 1633; CHECK: @ %bb.0: @ %entry 1634; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} 1635; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} 1636; CHECK-NEXT: vmov r0, r12, d3 1637; CHECK-NEXT: vmov r2, lr, d1 1638; CHECK-NEXT: vmov r4, r9, d2 1639; CHECK-NEXT: vmov.i32 q1, #0x0 1640; CHECK-NEXT: vmov r6, r7, d0 1641; CHECK-NEXT: umull r1, r8, r2, r0 1642; CHECK-NEXT: umull r3, r5, r6, r4 1643; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 1644; CHECK-NEXT: mla r1, r2, r12, r8 1645; CHECK-NEXT: mla r0, lr, r0, r1 1646; CHECK-NEXT: mla r1, r6, r9, r5 1647; CHECK-NEXT: mla r1, r7, r4, r1 1648; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 1649; CHECK-NEXT: vmov r0, r1, d4 1650; CHECK-NEXT: orrs r0, r1 1651; CHECK-NEXT: mov.w r1, #0 1652; CHECK-NEXT: csetm r0, eq 1653; CHECK-NEXT: bfi r1, r0, #0, #8 1654; CHECK-NEXT: vmov r0, r2, d5 1655; CHECK-NEXT: orrs r0, r2 1656; CHECK-NEXT: csetm r0, eq 1657; CHECK-NEXT: bfi r1, r0, #8, #8 1658; CHECK-NEXT: vmsr p0, r1 1659; CHECK-NEXT: vpsel q0, q0, q1 1660; CHECK-NEXT: vmov r0, r1, d1 1661; CHECK-NEXT: vmov r2, r3, d0 1662; CHECK-NEXT: adds r0, r0, r2 1663; CHECK-NEXT: adcs r1, r3 1664; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} 1665entry: 1666 %c = icmp eq <2 x i64> %b, zeroinitializer 1667 %m = mul <2 x i64> %x, %y 1668 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 1669 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 1670 ret i64 %z 1671} 1672 ; add_v4i32_v4i32_acc: accumulating variant - the predicated i32 reduction of %x * %y (lanes where %b == 0) is added to the scalar %a. Expected codegen is one predicated vmlavat.u32 accumulating into r0. 1673define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) { 1674; CHECK-LABEL: add_v4i32_v4i32_acc: 1675; CHECK: @ %bb.0: @ %entry 1676; CHECK-NEXT: vpt.i32 eq, q2, zr 1677; CHECK-NEXT: vmlavat.u32 r0, q0, q1 1678; CHECK-NEXT: bx lr 1679entry: 1680 %c = icmp eq <4 x i32> %b, zeroinitializer 1681 %m = mul <4 x i32> %x, %y 1682 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 1683 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 1684 %r = add i32 %z, %a 1685 ret i32 %r 
1686} 1687 ; add_v4i32_v4i64_acc_zext: i64 accumulating variant - the predicated reduction of zext(%x) * zext(%y) is added to %a. Expected codegen is one predicated vmlalvat.u32 accumulating into r0, r1. 1688define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) { 1689; CHECK-LABEL: add_v4i32_v4i64_acc_zext: 1690; CHECK: @ %bb.0: @ %entry 1691; CHECK-NEXT: vpt.i32 eq, q2, zr 1692; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1 1693; CHECK-NEXT: bx lr 1694entry: 1695 %c = icmp eq <4 x i32> %b, zeroinitializer 1696 %xx = zext <4 x i32> %x to <4 x i64> 1697 %yy = zext <4 x i32> %y to <4 x i64> 1698 %m = mul <4 x i64> %xx, %yy 1699 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1700 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1701 %r = add i64 %z, %a 1702 ret i64 %r 1703} 1704 ; add_v4i32_v4i64_acc_sext: signed counterpart; expected codegen is one predicated vmlalvat.s32. 1705define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) { 1706; CHECK-LABEL: add_v4i32_v4i64_acc_sext: 1707; CHECK: @ %bb.0: @ %entry 1708; CHECK-NEXT: vpt.i32 eq, q2, zr 1709; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1 1710; CHECK-NEXT: bx lr 1711entry: 1712 %c = icmp eq <4 x i32> %b, zeroinitializer 1713 %xx = sext <4 x i32> %x to <4 x i64> 1714 %yy = sext <4 x i32> %y to <4 x i64> 1715 %m = mul <4 x i64> %xx, %yy 1716 %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer 1717 %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) 1718 %r = add i64 %z, %a 1719 ret i64 %r 1720} 1721 ; add_v2i32_v2i64_acc_zext: two-lane accumulating case. Expected codegen uses vmullb.u32 for the products, builds the predicate by hand (cmp/csetm/bfi, then vmsr p0), selects with vpsel and folds the two lanes and %a together with scalar adds/adc pairs. 1722define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) { 1723; CHECK-LABEL: add_v2i32_v2i64_acc_zext: 1724; CHECK: @ %bb.0: @ %entry 1725; CHECK-NEXT: .save {r7, lr} 1726; CHECK-NEXT: push {r7, lr} 1727; CHECK-NEXT: vmov r2, s8 1728; CHECK-NEXT: movs r3, #0 1729; CHECK-NEXT: vmullb.u32 q3, q0, q1 1730; CHECK-NEXT: vmov.i32 q0, #0x0 1731; CHECK-NEXT: cmp r2, #0 1732; CHECK-NEXT: csetm r2, eq 1733; CHECK-NEXT: bfi r3, r2, #0, #8 1734; CHECK-NEXT: vmov r2, s10 1735; CHECK-NEXT: cmp r2, #0 1736; CHECK-NEXT: csetm r2, eq 1737; CHECK-NEXT: bfi r3, r2, #8, #8 1738; CHECK-NEXT: vmsr p0, r3 1739; CHECK-NEXT: vpsel q0, q3, q0 1740; 
CHECK-NEXT: vmov lr, r12, d1 1741; CHECK-NEXT: vmov r3, r2, d0 1742; CHECK-NEXT: adds.w r3, r3, lr 1743; CHECK-NEXT: adc.w r2, r2, r12 1744; CHECK-NEXT: adds r0, r0, r3 1745; CHECK-NEXT: adcs r1, r2 1746; CHECK-NEXT: pop {r7, pc} 1747entry: 1748 %c = icmp eq <2 x i32> %b, zeroinitializer 1749 %xx = zext <2 x i32> %x to <2 x i64> 1750 %yy = zext <2 x i32> %y to <2 x i64> 1751 %m = mul <2 x i64> %xx, %yy 1752 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 1753 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 1754 %r = add i64 %z, %a 1755 ret i64 %r 1756} 1757 ; add_v2i32_v2i64_acc_sext: signed two-lane accumulating case. Expected codegen uses vmullb.s32 for the products, then a hand-built predicate (cmp/csetm/bfi, vmsr p0), vpsel and scalar adds/adc pairs to fold in the two lanes and %a. 1758define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) { 1759; CHECK-LABEL: add_v2i32_v2i64_acc_sext: 1760; CHECK: @ %bb.0: @ %entry 1761; CHECK-NEXT: .save {r7, lr} 1762; CHECK-NEXT: push {r7, lr} 1763; CHECK-NEXT: vmov r2, s8 1764; CHECK-NEXT: movs r3, #0 1765; CHECK-NEXT: vmullb.s32 q3, q0, q1 1766; CHECK-NEXT: vmov.i32 q0, #0x0 1767; CHECK-NEXT: cmp r2, #0 1768; CHECK-NEXT: csetm r2, eq 1769; CHECK-NEXT: bfi r3, r2, #0, #8 1770; CHECK-NEXT: vmov r2, s10 1771; CHECK-NEXT: cmp r2, #0 1772; CHECK-NEXT: csetm r2, eq 1773; CHECK-NEXT: bfi r3, r2, #8, #8 1774; CHECK-NEXT: vmsr p0, r3 1775; CHECK-NEXT: vpsel q0, q3, q0 1776; CHECK-NEXT: vmov lr, r12, d1 1777; CHECK-NEXT: vmov r3, r2, d0 1778; CHECK-NEXT: adds.w r3, r3, lr 1779; CHECK-NEXT: adc.w r2, r2, r12 1780; CHECK-NEXT: adds r0, r0, r3 1781; CHECK-NEXT: adcs r1, r2 1782; CHECK-NEXT: pop {r7, pc} 1783entry: 1784 %c = icmp eq <2 x i32> %b, zeroinitializer 1785 %xx = sext <2 x i32> %x to <2 x i64> 1786 %yy = sext <2 x i32> %y to <2 x i64> 1787 %m = mul <2 x i64> %xx, %yy 1788 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 1789 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 1790 %r = add i64 %z, %a 1791 ret i64 %r 1792} 1793 ; add_v8i16_v8i32_acc_zext: i32 accumulating reduction of zext(%x) * zext(%y) over the 8 lanes where %b == 0, plus %a. Expected codegen is one predicated vmlavat.u16. 1794define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) { 1795; CHECK-LABEL: 
add_v8i16_v8i32_acc_zext: 1796; CHECK: @ %bb.0: @ %entry 1797; CHECK-NEXT: vpt.i16 eq, q2, zr 1798; CHECK-NEXT: vmlavat.u16 r0, q0, q1 1799; CHECK-NEXT: bx lr 1800entry: 1801 %c = icmp eq <8 x i16> %b, zeroinitializer 1802 %xx = zext <8 x i16> %x to <8 x i32> 1803 %yy = zext <8 x i16> %y to <8 x i32> 1804 %m = mul <8 x i32> %xx, %yy 1805 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer 1806 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) 1807 %r = add i32 %z, %a 1808 ret i32 %r 1809} 1810 ; add_v8i16_v8i32_acc_sext: signed counterpart; expected codegen is one predicated vmlavat.s16. 1811define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) { 1812; CHECK-LABEL: add_v8i16_v8i32_acc_sext: 1813; CHECK: @ %bb.0: @ %entry 1814; CHECK-NEXT: vpt.i16 eq, q2, zr 1815; CHECK-NEXT: vmlavat.s16 r0, q0, q1 1816; CHECK-NEXT: bx lr 1817entry: 1818 %c = icmp eq <8 x i16> %b, zeroinitializer 1819 %xx = sext <8 x i16> %x to <8 x i32> 1820 %yy = sext <8 x i16> %y to <8 x i32> 1821 %m = mul <8 x i32> %xx, %yy 1822 %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer 1823 %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) 1824 %r = add i32 %z, %a 1825 ret i32 %r 1826} 1827 ; add_v4i16_v4i32_acc_zext: 4-lane i16 inputs. Expected codegen widens all three inputs with vmovlb.u16 and issues one predicated vmlavat.u32. 1828define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) { 1829; CHECK-LABEL: add_v4i16_v4i32_acc_zext: 1830; CHECK: @ %bb.0: @ %entry 1831; CHECK-NEXT: vmovlb.u16 q2, q2 1832; CHECK-NEXT: vmovlb.u16 q1, q1 1833; CHECK-NEXT: vmovlb.u16 q0, q0 1834; CHECK-NEXT: vpt.i32 eq, q2, zr 1835; CHECK-NEXT: vmlavat.u32 r0, q0, q1 1836; CHECK-NEXT: bx lr 1837entry: 1838 %c = icmp eq <4 x i16> %b, zeroinitializer 1839 %xx = zext <4 x i16> %x to <4 x i32> 1840 %yy = zext <4 x i16> %y to <4 x i32> 1841 %m = mul <4 x i32> %xx, %yy 1842 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 1843 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 1844 %r = add i32 %z, %a 1845 ret i32 %r 1846} 1847 ; add_v4i16_v4i32_acc_sext: signed 4-lane counterpart. The operands are widened with vmovlb.s16, yet the reduction in the assertions is vmlavat.u32, not .s32. NOTE(review): presumably intentional - once the operands are widened, the low 32 bits of the product do not depend on signedness - so do not hand-edit it; regenerate with utils/update_llc_test_checks.py if in doubt. 1848define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x 
i16> %y, <4 x i16> %b, i32 %a) { 1849; CHECK-LABEL: add_v4i16_v4i32_acc_sext: 1850; CHECK: @ %bb.0: @ %entry 1851; CHECK-NEXT: vmovlb.u16 q2, q2 1852; CHECK-NEXT: vmovlb.s16 q1, q1 1853; CHECK-NEXT: vmovlb.s16 q0, q0 1854; CHECK-NEXT: vpt.i32 eq, q2, zr 1855; CHECK-NEXT: vmlavat.u32 r0, q0, q1 1856; CHECK-NEXT: bx lr 1857entry: 1858 %c = icmp eq <4 x i16> %b, zeroinitializer 1859 %xx = sext <4 x i16> %x to <4 x i32> 1860 %yy = sext <4 x i16> %y to <4 x i32> 1861 %m = mul <4 x i32> %xx, %yy 1862 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 1863 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 1864 %r = add i32 %z, %a 1865 ret i32 %r 1866} 1867 1868define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) { 1869; CHECK-LABEL: add_v8i16_v8i16_acc: 1870; CHECK: @ %bb.0: @ %entry 1871; CHECK-NEXT: vpt.i16 eq, q2, zr 1872; CHECK-NEXT: vmlavat.u16 r0, q0, q1 1873; CHECK-NEXT: uxth r0, r0 1874; CHECK-NEXT: bx lr 1875entry: 1876 %c = icmp eq <8 x i16> %b, zeroinitializer 1877 %m = mul <8 x i16> %x, %y 1878 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer 1879 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) 1880 %r = add i16 %z, %a 1881 ret i16 %r 1882} 1883 1884define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { 1885; CHECK-LABEL: add_v8i16_v8i64_acc_zext: 1886; CHECK: @ %bb.0: @ %entry 1887; CHECK-NEXT: vpt.i16 eq, q2, zr 1888; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1 1889; CHECK-NEXT: bx lr 1890entry: 1891 %c = icmp eq <8 x i16> %b, zeroinitializer 1892 %xx = zext <8 x i16> %x to <8 x i64> 1893 %yy = zext <8 x i16> %y to <8 x i64> 1894 %m = mul <8 x i64> %xx, %yy 1895 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer 1896 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1897 %r = add i64 %z, %a 1898 ret i64 %r 1899} 1900 1901define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x 
i16> %y, <8 x i16> %b, i64 %a) { 1902; CHECK-LABEL: add_v8i16_v8i64_acc_sext: 1903; CHECK: @ %bb.0: @ %entry 1904; CHECK-NEXT: vpt.i16 eq, q2, zr 1905; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1 1906; CHECK-NEXT: bx lr 1907entry: 1908 %c = icmp eq <8 x i16> %b, zeroinitializer 1909 %xx = sext <8 x i16> %x to <8 x i64> 1910 %yy = sext <8 x i16> %y to <8 x i64> 1911 %m = mul <8 x i64> %xx, %yy 1912 %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer 1913 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1914 %r = add i64 %z, %a 1915 ret i64 %r 1916} 1917 1918define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { 1919; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext: 1920; CHECK: @ %bb.0: @ %entry 1921; CHECK-NEXT: vpt.i16 eq, q2, zr 1922; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1 1923; CHECK-NEXT: bx lr 1924entry: 1925 %c = icmp eq <8 x i16> %b, zeroinitializer 1926 %xx = zext <8 x i16> %x to <8 x i32> 1927 %yy = zext <8 x i16> %y to <8 x i32> 1928 %m = mul <8 x i32> %xx, %yy 1929 %ma = zext <8 x i32> %m to <8 x i64> 1930 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer 1931 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1932 %r = add i64 %z, %a 1933 ret i64 %r 1934} 1935 1936define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { 1937; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext: 1938; CHECK: @ %bb.0: @ %entry 1939; CHECK-NEXT: vpt.i16 eq, q2, zr 1940; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1 1941; CHECK-NEXT: bx lr 1942entry: 1943 %c = icmp eq <8 x i16> %b, zeroinitializer 1944 %xx = sext <8 x i16> %x to <8 x i32> 1945 %yy = sext <8 x i16> %y to <8 x i32> 1946 %m = mul <8 x i32> %xx, %yy 1947 %ma = sext <8 x i32> %m to <8 x i64> 1948 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer 1949 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1950 %r = add i64 %z, %a 1951 ret i64 %r 1952} 1953 
1954define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) { 1955; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext: 1956; CHECK: @ %bb.0: @ %entry 1957; CHECK-NEXT: vpt.i16 eq, q2, zr 1958; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q0 1959; CHECK-NEXT: bx lr 1960entry: 1961 %c = icmp eq <8 x i16> %b, zeroinitializer 1962 %xx = sext <8 x i16> %x to <8 x i32> 1963 %m = mul <8 x i32> %xx, %xx 1964 %ma = zext <8 x i32> %m to <8 x i64> 1965 %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer 1966 %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) 1967 %r = add i64 %z, %a 1968 ret i64 %r 1969} 1970 1971define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) { 1972; CHECK-LABEL: add_v2i16_v2i64_acc_zext: 1973; CHECK: @ %bb.0: @ %entry 1974; CHECK-NEXT: .save {r7, lr} 1975; CHECK-NEXT: push {r7, lr} 1976; CHECK-NEXT: vmov.i64 q3, #0xffff 1977; CHECK-NEXT: vand q1, q1, q3 1978; CHECK-NEXT: vand q0, q0, q3 1979; CHECK-NEXT: vmov r2, s6 1980; CHECK-NEXT: vmov r3, s2 1981; CHECK-NEXT: umull lr, r12, r3, r2 1982; CHECK-NEXT: vmov r3, s4 1983; CHECK-NEXT: vmov r2, s0 1984; CHECK-NEXT: vand q1, q2, q3 1985; CHECK-NEXT: umull r2, r3, r2, r3 1986; CHECK-NEXT: vmov q0[2], q0[0], r2, lr 1987; CHECK-NEXT: vmov r2, s4 1988; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 1989; CHECK-NEXT: movs r3, #0 1990; CHECK-NEXT: cmp r2, #0 1991; CHECK-NEXT: csetm r2, eq 1992; CHECK-NEXT: bfi r3, r2, #0, #8 1993; CHECK-NEXT: vmov r2, s6 1994; CHECK-NEXT: vmov.i32 q1, #0x0 1995; CHECK-NEXT: cmp r2, #0 1996; CHECK-NEXT: csetm r2, eq 1997; CHECK-NEXT: bfi r3, r2, #8, #8 1998; CHECK-NEXT: vmsr p0, r3 1999; CHECK-NEXT: vpsel q0, q0, q1 2000; CHECK-NEXT: vmov lr, r12, d1 2001; CHECK-NEXT: vmov r3, r2, d0 2002; CHECK-NEXT: adds.w r3, r3, lr 2003; CHECK-NEXT: adc.w r2, r2, r12 2004; CHECK-NEXT: adds r0, r0, r3 2005; CHECK-NEXT: adcs r1, r2 2006; CHECK-NEXT: pop {r7, pc} 2007entry: 2008 %c = icmp 
eq <2 x i16> %b, zeroinitializer 2009 %xx = zext <2 x i16> %x to <2 x i64> 2010 %yy = zext <2 x i16> %y to <2 x i64> 2011 %m = mul <2 x i64> %xx, %yy 2012 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 2013 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 2014 %r = add i64 %z, %a 2015 ret i64 %r 2016} 2017 2018define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) { 2019; CHECK-LABEL: add_v2i16_v2i64_acc_sext: 2020; CHECK: @ %bb.0: @ %entry 2021; CHECK-NEXT: .save {r7, lr} 2022; CHECK-NEXT: push {r7, lr} 2023; CHECK-NEXT: vmov.i32 q3, #0xffff 2024; CHECK-NEXT: movs r3, #0 2025; CHECK-NEXT: vand q2, q2, q3 2026; CHECK-NEXT: vmov r2, s8 2027; CHECK-NEXT: cmp r2, #0 2028; CHECK-NEXT: csetm r2, eq 2029; CHECK-NEXT: bfi r3, r2, #0, #8 2030; CHECK-NEXT: vmov r2, s10 2031; CHECK-NEXT: cmp r2, #0 2032; CHECK-NEXT: csetm r2, eq 2033; CHECK-NEXT: bfi r3, r2, #8, #8 2034; CHECK-NEXT: vmov r2, s6 2035; CHECK-NEXT: vmsr p0, r3 2036; CHECK-NEXT: vmov r3, s2 2037; CHECK-NEXT: sxth r2, r2 2038; CHECK-NEXT: sxth r3, r3 2039; CHECK-NEXT: smull lr, r12, r3, r2 2040; CHECK-NEXT: vmov r3, s4 2041; CHECK-NEXT: vmov r2, s0 2042; CHECK-NEXT: vmov.i32 q1, #0x0 2043; CHECK-NEXT: sxth r3, r3 2044; CHECK-NEXT: sxth r2, r2 2045; CHECK-NEXT: smull r2, r3, r2, r3 2046; CHECK-NEXT: vmov q0[2], q0[0], r2, lr 2047; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 2048; CHECK-NEXT: vpsel q0, q0, q1 2049; CHECK-NEXT: vmov lr, r12, d1 2050; CHECK-NEXT: vmov r3, r2, d0 2051; CHECK-NEXT: adds.w r3, r3, lr 2052; CHECK-NEXT: adc.w r2, r2, r12 2053; CHECK-NEXT: adds r0, r0, r3 2054; CHECK-NEXT: adcs r1, r2 2055; CHECK-NEXT: pop {r7, pc} 2056entry: 2057 %c = icmp eq <2 x i16> %b, zeroinitializer 2058 %xx = sext <2 x i16> %x to <2 x i64> 2059 %yy = sext <2 x i16> %y to <2 x i64> 2060 %m = mul <2 x i64> %xx, %yy 2061 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 2062 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 
2063 %r = add i64 %z, %a 2064 ret i64 %r 2065} 2066 2067define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2068; CHECK-LABEL: add_v16i8_v16i32_acc_zext: 2069; CHECK: @ %bb.0: @ %entry 2070; CHECK-NEXT: vpt.i8 eq, q2, zr 2071; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2072; CHECK-NEXT: bx lr 2073entry: 2074 %c = icmp eq <16 x i8> %b, zeroinitializer 2075 %xx = zext <16 x i8> %x to <16 x i32> 2076 %yy = zext <16 x i8> %y to <16 x i32> 2077 %m = mul <16 x i32> %xx, %yy 2078 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer 2079 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2080 %r = add i32 %z, %a 2081 ret i32 %r 2082} 2083 2084define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2085; CHECK-LABEL: add_v16i8_v16i32_acc_sext: 2086; CHECK: @ %bb.0: @ %entry 2087; CHECK-NEXT: vpt.i8 eq, q2, zr 2088; CHECK-NEXT: vmlavat.s8 r0, q0, q1 2089; CHECK-NEXT: bx lr 2090entry: 2091 %c = icmp eq <16 x i8> %b, zeroinitializer 2092 %xx = sext <16 x i8> %x to <16 x i32> 2093 %yy = sext <16 x i8> %y to <16 x i32> 2094 %m = mul <16 x i32> %xx, %yy 2095 %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer 2096 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2097 %r = add i32 %z, %a 2098 ret i32 %r 2099} 2100 2101define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2102; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext: 2103; CHECK: @ %bb.0: @ %entry 2104; CHECK-NEXT: vpt.i8 eq, q2, zr 2105; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2106; CHECK-NEXT: bx lr 2107entry: 2108 %c = icmp eq <16 x i8> %b, zeroinitializer 2109 %xx = zext <16 x i8> %x to <16 x i16> 2110 %yy = zext <16 x i8> %y to <16 x i16> 2111 %m = mul <16 x i16> %xx, %yy 2112 %ma = zext <16 x i16> %m to <16 x i32> 2113 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 2114 %z = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2115 %r = add i32 %z, %a 2116 ret i32 %r 2117} 2118 2119define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2120; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext: 2121; CHECK: @ %bb.0: @ %entry 2122; CHECK-NEXT: vpt.i8 eq, q2, zr 2123; CHECK-NEXT: vmlavat.s8 r0, q0, q1 2124; CHECK-NEXT: bx lr 2125entry: 2126 %c = icmp eq <16 x i8> %b, zeroinitializer 2127 %xx = sext <16 x i8> %x to <16 x i16> 2128 %yy = sext <16 x i8> %y to <16 x i16> 2129 %m = mul <16 x i16> %xx, %yy 2130 %ma = sext <16 x i16> %m to <16 x i32> 2131 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 2132 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2133 %r = add i32 %z, %a 2134 ret i32 %r 2135} 2136 2137define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) { 2138; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext: 2139; CHECK: @ %bb.0: @ %entry 2140; CHECK-NEXT: vpt.i8 eq, q2, zr 2141; CHECK-NEXT: vmlavat.s8 r0, q0, q0 2142; CHECK-NEXT: bx lr 2143entry: 2144 %c = icmp eq <16 x i8> %b, zeroinitializer 2145 %xx = sext <16 x i8> %x to <16 x i16> 2146 %m = mul <16 x i16> %xx, %xx 2147 %ma = zext <16 x i16> %m to <16 x i32> 2148 %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer 2149 %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) 2150 %r = add i32 %z, %a 2151 ret i32 %r 2152} 2153 2154define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) { 2155; CHECK-LABEL: add_v4i8_v4i32_acc_zext: 2156; CHECK: @ %bb.0: @ %entry 2157; CHECK-NEXT: vmov.i32 q3, #0xff 2158; CHECK-NEXT: vand q2, q2, q3 2159; CHECK-NEXT: vand q1, q1, q3 2160; CHECK-NEXT: vand q0, q0, q3 2161; CHECK-NEXT: vpt.i32 eq, q2, zr 2162; CHECK-NEXT: vmlavat.u32 r0, q0, q1 2163; CHECK-NEXT: bx lr 2164entry: 2165 %c = icmp eq <4 x i8> %b, zeroinitializer 2166 %xx = zext <4 x i8> %x 
to <4 x i32> 2167 %yy = zext <4 x i8> %y to <4 x i32> 2168 %m = mul <4 x i32> %xx, %yy 2169 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 2170 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 2171 %r = add i32 %z, %a 2172 ret i32 %r 2173} 2174 2175define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) { 2176; CHECK-LABEL: add_v4i8_v4i32_acc_sext: 2177; CHECK: @ %bb.0: @ %entry 2178; CHECK-NEXT: vmov.i32 q3, #0xff 2179; CHECK-NEXT: vmovlb.s8 q1, q1 2180; CHECK-NEXT: vmovlb.s8 q0, q0 2181; CHECK-NEXT: vand q2, q2, q3 2182; CHECK-NEXT: vmovlb.s16 q1, q1 2183; CHECK-NEXT: vmovlb.s16 q0, q0 2184; CHECK-NEXT: vpt.i32 eq, q2, zr 2185; CHECK-NEXT: vmlavat.u32 r0, q0, q1 2186; CHECK-NEXT: bx lr 2187entry: 2188 %c = icmp eq <4 x i8> %b, zeroinitializer 2189 %xx = sext <4 x i8> %x to <4 x i32> 2190 %yy = sext <4 x i8> %y to <4 x i32> 2191 %m = mul <4 x i32> %xx, %yy 2192 %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer 2193 %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) 2194 %r = add i32 %z, %a 2195 ret i32 %r 2196} 2197 2198define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) { 2199; CHECK-LABEL: add_v16i8_v16i16_acc_zext: 2200; CHECK: @ %bb.0: @ %entry 2201; CHECK-NEXT: vpt.i8 eq, q2, zr 2202; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2203; CHECK-NEXT: uxth r0, r0 2204; CHECK-NEXT: bx lr 2205entry: 2206 %c = icmp eq <16 x i8> %b, zeroinitializer 2207 %xx = zext <16 x i8> %x to <16 x i16> 2208 %yy = zext <16 x i8> %y to <16 x i16> 2209 %m = mul <16 x i16> %xx, %yy 2210 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer 2211 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) 2212 %r = add i16 %z, %a 2213 ret i16 %r 2214} 2215 2216define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) { 2217; CHECK-LABEL: add_v16i8_v16i16_acc_sext: 2218; CHECK: @ 
%bb.0: @ %entry 2219; CHECK-NEXT: vpt.i8 eq, q2, zr 2220; CHECK-NEXT: vmlavat.s8 r0, q0, q1 2221; CHECK-NEXT: sxth r0, r0 2222; CHECK-NEXT: bx lr 2223entry: 2224 %c = icmp eq <16 x i8> %b, zeroinitializer 2225 %xx = sext <16 x i8> %x to <16 x i16> 2226 %yy = sext <16 x i8> %y to <16 x i16> 2227 %m = mul <16 x i16> %xx, %yy 2228 %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer 2229 %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) 2230 %r = add i16 %z, %a 2231 ret i16 %r 2232} 2233 2234define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) { 2235; CHECK-LABEL: add_v8i8_v8i16_acc_zext: 2236; CHECK: @ %bb.0: @ %entry 2237; CHECK-NEXT: vmovlb.u8 q2, q2 2238; CHECK-NEXT: vmovlb.u8 q1, q1 2239; CHECK-NEXT: vmovlb.u8 q0, q0 2240; CHECK-NEXT: vpt.i16 eq, q2, zr 2241; CHECK-NEXT: vmlavat.u16 r0, q0, q1 2242; CHECK-NEXT: uxth r0, r0 2243; CHECK-NEXT: bx lr 2244entry: 2245 %c = icmp eq <8 x i8> %b, zeroinitializer 2246 %xx = zext <8 x i8> %x to <8 x i16> 2247 %yy = zext <8 x i8> %y to <8 x i16> 2248 %m = mul <8 x i16> %xx, %yy 2249 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer 2250 %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) 2251 %r = add i16 %z, %a 2252 ret i16 %r 2253} 2254 2255define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) { 2256; CHECK-LABEL: add_v8i8_v8i16_acc_sext: 2257; CHECK: @ %bb.0: @ %entry 2258; CHECK-NEXT: vmovlb.u8 q2, q2 2259; CHECK-NEXT: vmovlb.s8 q1, q1 2260; CHECK-NEXT: vmovlb.s8 q0, q0 2261; CHECK-NEXT: vpt.i16 eq, q2, zr 2262; CHECK-NEXT: vmlavat.u16 r0, q0, q1 2263; CHECK-NEXT: sxth r0, r0 2264; CHECK-NEXT: bx lr 2265entry: 2266 %c = icmp eq <8 x i8> %b, zeroinitializer 2267 %xx = sext <8 x i8> %x to <8 x i16> 2268 %yy = sext <8 x i8> %y to <8 x i16> 2269 %m = mul <8 x i16> %xx, %yy 2270 %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer 2271 %z = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> %s) 2272 %r = add i16 %z, %a 2273 ret i16 %r 2274} 2275 2276define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) { 2277; CHECK-LABEL: add_v16i8_v16i8_acc: 2278; CHECK: @ %bb.0: @ %entry 2279; CHECK-NEXT: vpt.i8 eq, q2, zr 2280; CHECK-NEXT: vmlavat.u8 r0, q0, q1 2281; CHECK-NEXT: uxtb r0, r0 2282; CHECK-NEXT: bx lr 2283entry: 2284 %c = icmp eq <16 x i8> %b, zeroinitializer 2285 %m = mul <16 x i8> %x, %y 2286 %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer 2287 %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) 2288 %r = add i8 %z, %a 2289 ret i8 %r 2290} 2291 2292define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { 2293; CHECK-LABEL: add_v16i8_v16i64_acc_zext: 2294; CHECK: @ %bb.0: @ %entry 2295; CHECK-NEXT: .save {r4, r5, r7, lr} 2296; CHECK-NEXT: push {r4, r5, r7, lr} 2297; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2298; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2299; CHECK-NEXT: .pad #32 2300; CHECK-NEXT: sub sp, #32 2301; CHECK-NEXT: vmov q3, q0 2302; CHECK-NEXT: vmov.i8 q0, #0x0 2303; CHECK-NEXT: vcmp.i8 eq, q2, zr 2304; CHECK-NEXT: vmov.i8 q2, #0xff 2305; CHECK-NEXT: vpsel q6, q2, q0 2306; CHECK-NEXT: vmov q4, q0 2307; CHECK-NEXT: vmov.u8 r2, q6[0] 2308; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 2309; CHECK-NEXT: vmov.16 q0[0], r2 2310; CHECK-NEXT: vmov.u8 r2, q6[1] 2311; CHECK-NEXT: vmov.16 q0[1], r2 2312; CHECK-NEXT: vmov.u8 r2, q6[2] 2313; CHECK-NEXT: vmov.16 q0[2], r2 2314; CHECK-NEXT: vmov.u8 r2, q6[3] 2315; CHECK-NEXT: vmov.16 q0[3], r2 2316; CHECK-NEXT: vmov.u8 r2, q6[4] 2317; CHECK-NEXT: vmov.16 q0[4], r2 2318; CHECK-NEXT: vmov.u8 r2, q6[5] 2319; CHECK-NEXT: vmov.16 q0[5], r2 2320; CHECK-NEXT: vmov.u8 r2, q6[6] 2321; CHECK-NEXT: vmov.16 q0[6], r2 2322; CHECK-NEXT: vmov.u8 r2, q6[7] 2323; CHECK-NEXT: vmov.16 q0[7], r2 2324; CHECK-NEXT: vstrw.32 q2, [sp, 
#16] @ 16-byte Spill 2325; CHECK-NEXT: vcmp.i16 ne, q0, zr 2326; CHECK-NEXT: vmov.u8 r4, q3[2] 2327; CHECK-NEXT: vpsel q7, q2, q4 2328; CHECK-NEXT: vmov.u16 r2, q7[2] 2329; CHECK-NEXT: vmov.u16 r3, q7[0] 2330; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 2331; CHECK-NEXT: vmov.u16 r2, q7[3] 2332; CHECK-NEXT: vmov.u16 r3, q7[1] 2333; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 2334; CHECK-NEXT: vcmp.i32 ne, q0, zr 2335; CHECK-NEXT: vpsel q0, q2, q4 2336; CHECK-NEXT: vmov r2, r3, d0 2337; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 2338; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 2339; CHECK-NEXT: vmov.u8 r2, q1[1] 2340; CHECK-NEXT: vmov.u8 r3, q1[0] 2341; CHECK-NEXT: vcmp.i32 ne, q2, zr 2342; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 2343; CHECK-NEXT: vmov.u8 r3, q3[1] 2344; CHECK-NEXT: vmov.u8 r2, q3[0] 2345; CHECK-NEXT: vmov.i64 q2, #0xff 2346; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 2347; CHECK-NEXT: vand q5, q5, q2 2348; CHECK-NEXT: vand q4, q4, q2 2349; CHECK-NEXT: vmov r12, s22 2350; CHECK-NEXT: vmov r2, s18 2351; CHECK-NEXT: vmov r3, s20 2352; CHECK-NEXT: vmov.i32 q5, #0x0 2353; CHECK-NEXT: umull lr, r12, r2, r12 2354; CHECK-NEXT: vmov r2, s16 2355; CHECK-NEXT: umull r2, r3, r2, r3 2356; CHECK-NEXT: vmov q4[2], q4[0], r2, lr 2357; CHECK-NEXT: vmov q4[3], q4[1], r3, r12 2358; CHECK-NEXT: vpsel q4, q4, q5 2359; CHECK-NEXT: vmov lr, r12, d9 2360; CHECK-NEXT: vmov r3, r2, d8 2361; CHECK-NEXT: adds.w lr, lr, r3 2362; CHECK-NEXT: adc.w r12, r12, r2 2363; CHECK-NEXT: vmov r2, r3, d1 2364; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 2365; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 2366; CHECK-NEXT: vmov.u8 r2, q1[3] 2367; CHECK-NEXT: vmov.u8 r3, q1[2] 2368; CHECK-NEXT: vcmp.i32 ne, q0, zr 2369; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 2370; CHECK-NEXT: vmov.u8 r3, q3[3] 2371; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 2372; CHECK-NEXT: vand q0, q0, q2 2373; CHECK-NEXT: vand q4, q4, q2 2374; CHECK-NEXT: vmov r2, s2 2375; CHECK-NEXT: vmov r3, s18 2376; CHECK-NEXT: vmov r5, s16 2377; CHECK-NEXT: vldrw.u32 q4, [sp] @ 
16-byte Reload 2378; CHECK-NEXT: vmov r4, s0 2379; CHECK-NEXT: umull r2, r3, r3, r2 2380; CHECK-NEXT: umull r4, r5, r5, r4 2381; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 2382; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 2383; CHECK-NEXT: vpsel q0, q0, q5 2384; CHECK-NEXT: vmov r2, r3, d0 2385; CHECK-NEXT: vmov r5, r4, d1 2386; CHECK-NEXT: adds.w r2, r2, lr 2387; CHECK-NEXT: adc.w r3, r3, r12 2388; CHECK-NEXT: adds.w r12, r2, r5 2389; CHECK-NEXT: adc.w lr, r3, r4 2390; CHECK-NEXT: vmov.u16 r5, q7[6] 2391; CHECK-NEXT: vmov.u16 r4, q7[4] 2392; CHECK-NEXT: vmov.u8 r2, q3[4] 2393; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2394; CHECK-NEXT: vmov.u16 r5, q7[7] 2395; CHECK-NEXT: vmov.u16 r4, q7[5] 2396; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2397; CHECK-NEXT: vcmp.i32 ne, q0, zr 2398; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 2399; CHECK-NEXT: vpsel q0, q0, q4 2400; CHECK-NEXT: vmov r5, r4, d0 2401; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 2402; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 2403; CHECK-NEXT: vmov.u8 r5, q1[5] 2404; CHECK-NEXT: vmov.u8 r4, q1[4] 2405; CHECK-NEXT: vcmp.i32 ne, q4, zr 2406; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 2407; CHECK-NEXT: vmov.u8 r4, q3[5] 2408; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 2409; CHECK-NEXT: vand q4, q4, q2 2410; CHECK-NEXT: vand q7, q7, q2 2411; CHECK-NEXT: vmov r5, s18 2412; CHECK-NEXT: vmov r2, s30 2413; CHECK-NEXT: vmov r3, s28 2414; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload 2415; CHECK-NEXT: vmov r4, s16 2416; CHECK-NEXT: umull r2, r5, r2, r5 2417; CHECK-NEXT: umull r3, r4, r3, r4 2418; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 2419; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 2420; CHECK-NEXT: vpsel q4, q4, q5 2421; CHECK-NEXT: vmov r2, r3, d8 2422; CHECK-NEXT: vmov r5, r4, d9 2423; CHECK-NEXT: adds.w r2, r2, r12 2424; CHECK-NEXT: adc.w r3, r3, lr 2425; CHECK-NEXT: adds.w r12, r2, r5 2426; CHECK-NEXT: adc.w lr, r3, r4 2427; CHECK-NEXT: vmov r5, r4, d1 2428; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2429; CHECK-NEXT: vmov.u8 r2, q3[6] 
2430; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2431; CHECK-NEXT: vmov.u8 r5, q1[7] 2432; CHECK-NEXT: vmov.u8 r4, q1[6] 2433; CHECK-NEXT: vcmp.i32 ne, q0, zr 2434; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2435; CHECK-NEXT: vmov.u8 r4, q3[7] 2436; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 2437; CHECK-NEXT: vand q0, q0, q2 2438; CHECK-NEXT: vand q4, q4, q2 2439; CHECK-NEXT: vmov r5, s2 2440; CHECK-NEXT: vmov r2, s18 2441; CHECK-NEXT: vmov r3, s16 2442; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 2443; CHECK-NEXT: vmov r4, s0 2444; CHECK-NEXT: umull r2, r5, r2, r5 2445; CHECK-NEXT: umull r3, r4, r3, r4 2446; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 2447; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2448; CHECK-NEXT: vpsel q0, q0, q5 2449; CHECK-NEXT: vmov r2, r3, d0 2450; CHECK-NEXT: vmov r5, r4, d1 2451; CHECK-NEXT: adds.w r2, r2, r12 2452; CHECK-NEXT: adc.w r3, r3, lr 2453; CHECK-NEXT: adds.w r12, r2, r5 2454; CHECK-NEXT: vmov.u8 r5, q6[8] 2455; CHECK-NEXT: adc.w lr, r3, r4 2456; CHECK-NEXT: vmov.16 q0[0], r5 2457; CHECK-NEXT: vmov.u8 r5, q6[9] 2458; CHECK-NEXT: vmov.16 q0[1], r5 2459; CHECK-NEXT: vmov.u8 r5, q6[10] 2460; CHECK-NEXT: vmov.16 q0[2], r5 2461; CHECK-NEXT: vmov.u8 r5, q6[11] 2462; CHECK-NEXT: vmov.16 q0[3], r5 2463; CHECK-NEXT: vmov.u8 r5, q6[12] 2464; CHECK-NEXT: vmov.16 q0[4], r5 2465; CHECK-NEXT: vmov.u8 r5, q6[13] 2466; CHECK-NEXT: vmov.16 q0[5], r5 2467; CHECK-NEXT: vmov.u8 r5, q6[14] 2468; CHECK-NEXT: vmov.16 q0[6], r5 2469; CHECK-NEXT: vmov.u8 r5, q6[15] 2470; CHECK-NEXT: vmov.16 q0[7], r5 2471; CHECK-NEXT: vmov.u8 r2, q3[8] 2472; CHECK-NEXT: vcmp.i16 ne, q0, zr 2473; CHECK-NEXT: vpsel q6, q7, q4 2474; CHECK-NEXT: vmov.u16 r5, q6[2] 2475; CHECK-NEXT: vmov.u16 r4, q6[0] 2476; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2477; CHECK-NEXT: vmov.u16 r5, q6[3] 2478; CHECK-NEXT: vmov.u16 r4, q6[1] 2479; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2480; CHECK-NEXT: vcmp.i32 ne, q0, zr 2481; CHECK-NEXT: vpsel q0, q7, q4 2482; CHECK-NEXT: vmov r5, r4, d0 2483; CHECK-NEXT: vmov q4[2], 
q4[0], r5, r4 2484; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 2485; CHECK-NEXT: vmov.u8 r5, q1[9] 2486; CHECK-NEXT: vmov.u8 r4, q1[8] 2487; CHECK-NEXT: vcmp.i32 ne, q4, zr 2488; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 2489; CHECK-NEXT: vmov.u8 r4, q3[9] 2490; CHECK-NEXT: vmov q7[2], q7[0], r2, r4 2491; CHECK-NEXT: vand q4, q4, q2 2492; CHECK-NEXT: vand q7, q7, q2 2493; CHECK-NEXT: vmov r5, s18 2494; CHECK-NEXT: vmov r2, s30 2495; CHECK-NEXT: vmov r4, s16 2496; CHECK-NEXT: vmov r3, s28 2497; CHECK-NEXT: umull r2, r5, r2, r5 2498; CHECK-NEXT: umull r3, r4, r3, r4 2499; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 2500; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 2501; CHECK-NEXT: vpsel q4, q4, q5 2502; CHECK-NEXT: vmov r2, r3, d8 2503; CHECK-NEXT: vmov r5, r4, d9 2504; CHECK-NEXT: adds.w r2, r2, r12 2505; CHECK-NEXT: adc.w r3, r3, lr 2506; CHECK-NEXT: adds.w r12, r2, r5 2507; CHECK-NEXT: adc.w lr, r3, r4 2508; CHECK-NEXT: vmov r5, r4, d1 2509; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2510; CHECK-NEXT: vmov.u8 r2, q3[10] 2511; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2512; CHECK-NEXT: vmov.u8 r5, q1[11] 2513; CHECK-NEXT: vmov.u8 r4, q1[10] 2514; CHECK-NEXT: vcmp.i32 ne, q0, zr 2515; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2516; CHECK-NEXT: vmov.u8 r4, q3[11] 2517; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 2518; CHECK-NEXT: vand q0, q0, q2 2519; CHECK-NEXT: vand q4, q4, q2 2520; CHECK-NEXT: vmov r5, s2 2521; CHECK-NEXT: vmov r2, s18 2522; CHECK-NEXT: vmov r3, s16 2523; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload 2524; CHECK-NEXT: vmov r4, s0 2525; CHECK-NEXT: umull r2, r5, r2, r5 2526; CHECK-NEXT: umull r3, r4, r3, r4 2527; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 2528; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2529; CHECK-NEXT: vpsel q0, q0, q5 2530; CHECK-NEXT: vmov r2, r3, d0 2531; CHECK-NEXT: vmov r5, r4, d1 2532; CHECK-NEXT: adds.w r2, r2, r12 2533; CHECK-NEXT: adc.w r3, r3, lr 2534; CHECK-NEXT: adds.w r12, r2, r5 2535; CHECK-NEXT: adc.w lr, r3, r4 2536; CHECK-NEXT: vmov.u16 r5, q6[6] 2537; 
CHECK-NEXT: vmov.u16 r4, q6[4] 2538; CHECK-NEXT: vmov.u8 r2, q3[12] 2539; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2540; CHECK-NEXT: vmov.u16 r5, q6[7] 2541; CHECK-NEXT: vmov.u16 r4, q6[5] 2542; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2543; CHECK-NEXT: vcmp.i32 ne, q0, zr 2544; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 2545; CHECK-NEXT: vpsel q0, q0, q4 2546; CHECK-NEXT: vmov r5, r4, d0 2547; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 2548; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 2549; CHECK-NEXT: vmov.u8 r5, q1[13] 2550; CHECK-NEXT: vmov.u8 r4, q1[12] 2551; CHECK-NEXT: vcmp.i32 ne, q4, zr 2552; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 2553; CHECK-NEXT: vmov.u8 r4, q3[13] 2554; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 2555; CHECK-NEXT: vand q4, q4, q2 2556; CHECK-NEXT: vand q6, q6, q2 2557; CHECK-NEXT: vmov r5, s18 2558; CHECK-NEXT: vmov r2, s26 2559; CHECK-NEXT: vmov r4, s16 2560; CHECK-NEXT: vmov r3, s24 2561; CHECK-NEXT: umull r2, r5, r2, r5 2562; CHECK-NEXT: umull r3, r4, r3, r4 2563; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 2564; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 2565; CHECK-NEXT: vpsel q4, q4, q5 2566; CHECK-NEXT: vmov r2, r3, d8 2567; CHECK-NEXT: vmov r5, r4, d9 2568; CHECK-NEXT: adds.w r2, r2, r12 2569; CHECK-NEXT: adc.w r3, r3, lr 2570; CHECK-NEXT: adds.w r12, r2, r5 2571; CHECK-NEXT: adc.w lr, r3, r4 2572; CHECK-NEXT: vmov r5, r4, d1 2573; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2574; CHECK-NEXT: vmov.u8 r2, q3[14] 2575; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2576; CHECK-NEXT: vmov.u8 r5, q1[15] 2577; CHECK-NEXT: vmov.u8 r4, q1[14] 2578; CHECK-NEXT: vcmp.i32 ne, q0, zr 2579; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2580; CHECK-NEXT: vmov.u8 r4, q3[15] 2581; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 2582; CHECK-NEXT: vand q0, q0, q2 2583; CHECK-NEXT: vand q1, q1, q2 2584; CHECK-NEXT: vmov r5, s2 2585; CHECK-NEXT: vmov r2, s6 2586; CHECK-NEXT: vmov r4, s0 2587; CHECK-NEXT: vmov r3, s4 2588; CHECK-NEXT: umull r2, r5, r2, r5 2589; CHECK-NEXT: umull r3, r4, r3, r4 2590; 
CHECK-NEXT: vmov q0[2], q0[0], r3, r2 2591; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2592; CHECK-NEXT: vpsel q0, q0, q5 2593; CHECK-NEXT: vmov r2, r3, d0 2594; CHECK-NEXT: vmov r5, r4, d1 2595; CHECK-NEXT: adds.w r2, r2, r12 2596; CHECK-NEXT: adc.w r3, r3, lr 2597; CHECK-NEXT: adds r2, r2, r5 2598; CHECK-NEXT: adcs r3, r4 2599; CHECK-NEXT: adds r0, r0, r2 2600; CHECK-NEXT: adcs r1, r3 2601; CHECK-NEXT: add sp, #32 2602; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 2603; CHECK-NEXT: pop {r4, r5, r7, pc} 2604entry: 2605 %c = icmp eq <16 x i8> %b, zeroinitializer 2606 %xx = zext <16 x i8> %x to <16 x i64> 2607 %yy = zext <16 x i8> %y to <16 x i64> 2608 %m = mul <16 x i64> %xx, %yy 2609 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer 2610 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) 2611 %r = add i64 %z, %a 2612 ret i64 %r 2613} 2614 2615define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { 2616; CHECK-LABEL: add_v16i8_v16i64_acc_sext: 2617; CHECK: @ %bb.0: @ %entry 2618; CHECK-NEXT: .save {r4, r5, r7, lr} 2619; CHECK-NEXT: push {r4, r5, r7, lr} 2620; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2621; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2622; CHECK-NEXT: .pad #16 2623; CHECK-NEXT: sub sp, #16 2624; CHECK-NEXT: vmov q3, q0 2625; CHECK-NEXT: vcmp.i8 eq, q2, zr 2626; CHECK-NEXT: vmov.i8 q0, #0x0 2627; CHECK-NEXT: vmov.i8 q2, #0xff 2628; CHECK-NEXT: vpsel q5, q2, q0 2629; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 2630; CHECK-NEXT: vmov.u8 r2, q5[0] 2631; CHECK-NEXT: vmov.s8 r4, q1[2] 2632; CHECK-NEXT: vmov.16 q4[0], r2 2633; CHECK-NEXT: vmov.u8 r2, q5[1] 2634; CHECK-NEXT: vmov.16 q4[1], r2 2635; CHECK-NEXT: vmov.u8 r2, q5[2] 2636; CHECK-NEXT: vmov.16 q4[2], r2 2637; CHECK-NEXT: vmov.u8 r2, q5[3] 2638; CHECK-NEXT: vmov.16 q4[3], r2 2639; CHECK-NEXT: vmov.u8 r2, q5[4] 2640; CHECK-NEXT: vmov.16 q4[4], r2 2641; CHECK-NEXT: vmov.u8 r2, q5[5] 2642; 
CHECK-NEXT: vmov.16 q4[5], r2 2643; CHECK-NEXT: vmov.u8 r2, q5[6] 2644; CHECK-NEXT: vmov.16 q4[6], r2 2645; CHECK-NEXT: vmov.u8 r2, q5[7] 2646; CHECK-NEXT: vmov.16 q4[7], r2 2647; CHECK-NEXT: vmov.s8 r5, q3[2] 2648; CHECK-NEXT: vcmp.i16 ne, q4, zr 2649; CHECK-NEXT: smull r4, r5, r5, r4 2650; CHECK-NEXT: vpsel q6, q2, q0 2651; CHECK-NEXT: vmov.u16 r2, q6[2] 2652; CHECK-NEXT: vmov.u16 r3, q6[0] 2653; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 2654; CHECK-NEXT: vmov.u16 r2, q6[3] 2655; CHECK-NEXT: vmov.u16 r3, q6[1] 2656; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 2657; CHECK-NEXT: vcmp.i32 ne, q4, zr 2658; CHECK-NEXT: vpsel q7, q2, q0 2659; CHECK-NEXT: vmov r2, r3, d14 2660; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 2661; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 2662; CHECK-NEXT: vmov.s8 r2, q1[1] 2663; CHECK-NEXT: vmov.s8 r3, q3[1] 2664; CHECK-NEXT: vcmp.i32 ne, q4, zr 2665; CHECK-NEXT: smull lr, r12, r3, r2 2666; CHECK-NEXT: vmov.s8 r3, q1[0] 2667; CHECK-NEXT: vmov.s8 r2, q3[0] 2668; CHECK-NEXT: vmov.i32 q4, #0x0 2669; CHECK-NEXT: smull r2, r3, r2, r3 2670; CHECK-NEXT: vmov q0[2], q0[0], r2, lr 2671; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 2672; CHECK-NEXT: vpsel q0, q0, q4 2673; CHECK-NEXT: vmov lr, r12, d1 2674; CHECK-NEXT: vmov r3, r2, d0 2675; CHECK-NEXT: adds.w lr, lr, r3 2676; CHECK-NEXT: adc.w r12, r12, r2 2677; CHECK-NEXT: vmov r2, r3, d15 2678; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 2679; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload 2680; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 2681; CHECK-NEXT: vmov.s8 r2, q1[3] 2682; CHECK-NEXT: vmov.s8 r3, q3[3] 2683; CHECK-NEXT: vcmp.i32 ne, q0, zr 2684; CHECK-NEXT: smull r2, r3, r3, r2 2685; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 2686; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 2687; CHECK-NEXT: vpsel q0, q0, q4 2688; CHECK-NEXT: vmov r2, r3, d0 2689; CHECK-NEXT: vmov r5, r4, d1 2690; CHECK-NEXT: adds.w r2, r2, lr 2691; CHECK-NEXT: adc.w r3, r3, r12 2692; CHECK-NEXT: adds.w r12, r2, r5 2693; CHECK-NEXT: adc.w lr, r3, r4 2694; CHECK-NEXT: 
vmov.u16 r5, q6[6] 2695; CHECK-NEXT: vmov.u16 r4, q6[4] 2696; CHECK-NEXT: vmov.s8 r2, q1[4] 2697; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2698; CHECK-NEXT: vmov.u16 r5, q6[7] 2699; CHECK-NEXT: vmov.u16 r4, q6[5] 2700; CHECK-NEXT: vmov.s8 r3, q3[4] 2701; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2702; CHECK-NEXT: smull r2, r3, r3, r2 2703; CHECK-NEXT: vcmp.i32 ne, q0, zr 2704; CHECK-NEXT: vpsel q6, q2, q7 2705; CHECK-NEXT: vmov r5, r4, d12 2706; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2707; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2708; CHECK-NEXT: vmov.s8 r5, q1[5] 2709; CHECK-NEXT: vmov.s8 r4, q3[5] 2710; CHECK-NEXT: vcmp.i32 ne, q0, zr 2711; CHECK-NEXT: smull r5, r4, r4, r5 2712; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2713; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2714; CHECK-NEXT: vpsel q0, q0, q4 2715; CHECK-NEXT: vmov r2, r3, d0 2716; CHECK-NEXT: vmov r5, r4, d1 2717; CHECK-NEXT: adds.w r2, r2, r12 2718; CHECK-NEXT: adc.w r3, r3, lr 2719; CHECK-NEXT: adds.w r12, r2, r5 2720; CHECK-NEXT: adc.w lr, r3, r4 2721; CHECK-NEXT: vmov r5, r4, d13 2722; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2723; CHECK-NEXT: vmov.s8 r2, q1[6] 2724; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2725; CHECK-NEXT: vmov.s8 r3, q3[6] 2726; CHECK-NEXT: vmov.s8 r5, q1[7] 2727; CHECK-NEXT: vmov.s8 r4, q3[7] 2728; CHECK-NEXT: smull r5, r4, r4, r5 2729; CHECK-NEXT: vcmp.i32 ne, q0, zr 2730; CHECK-NEXT: smull r2, r3, r3, r2 2731; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2732; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2733; CHECK-NEXT: vpsel q0, q0, q4 2734; CHECK-NEXT: vmov r2, r3, d0 2735; CHECK-NEXT: vmov r5, r4, d1 2736; CHECK-NEXT: adds.w r2, r2, r12 2737; CHECK-NEXT: adc.w r3, r3, lr 2738; CHECK-NEXT: adds.w r12, r2, r5 2739; CHECK-NEXT: vmov.u8 r5, q5[8] 2740; CHECK-NEXT: adc.w lr, r3, r4 2741; CHECK-NEXT: vmov.16 q6[0], r5 2742; CHECK-NEXT: vmov.u8 r5, q5[9] 2743; CHECK-NEXT: vmov.16 q6[1], r5 2744; CHECK-NEXT: vmov.u8 r5, q5[10] 2745; CHECK-NEXT: vmov.16 q6[2], r5 2746; CHECK-NEXT: vmov.u8 r5, q5[11] 2747; CHECK-NEXT: 
vmov.16 q6[3], r5 2748; CHECK-NEXT: vmov.u8 r5, q5[12] 2749; CHECK-NEXT: vmov.16 q6[4], r5 2750; CHECK-NEXT: vmov.u8 r5, q5[13] 2751; CHECK-NEXT: vmov.16 q6[5], r5 2752; CHECK-NEXT: vmov.u8 r5, q5[14] 2753; CHECK-NEXT: vmov.16 q6[6], r5 2754; CHECK-NEXT: vmov.u8 r5, q5[15] 2755; CHECK-NEXT: vmov.16 q6[7], r5 2756; CHECK-NEXT: vmov.s8 r2, q1[8] 2757; CHECK-NEXT: vcmp.i16 ne, q6, zr 2758; CHECK-NEXT: vmov.s8 r3, q3[8] 2759; CHECK-NEXT: vpsel q5, q2, q7 2760; CHECK-NEXT: smull r2, r3, r3, r2 2761; CHECK-NEXT: vmov.u16 r5, q5[2] 2762; CHECK-NEXT: vmov.u16 r4, q5[0] 2763; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2764; CHECK-NEXT: vmov.u16 r5, q5[3] 2765; CHECK-NEXT: vmov.u16 r4, q5[1] 2766; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2767; CHECK-NEXT: vcmp.i32 ne, q0, zr 2768; CHECK-NEXT: vpsel q6, q2, q7 2769; CHECK-NEXT: vmov r5, r4, d12 2770; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2771; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2772; CHECK-NEXT: vmov.s8 r5, q1[9] 2773; CHECK-NEXT: vmov.s8 r4, q3[9] 2774; CHECK-NEXT: vcmp.i32 ne, q0, zr 2775; CHECK-NEXT: smull r5, r4, r4, r5 2776; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2777; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2778; CHECK-NEXT: vpsel q0, q0, q4 2779; CHECK-NEXT: vmov r2, r3, d0 2780; CHECK-NEXT: vmov r5, r4, d1 2781; CHECK-NEXT: adds.w r2, r2, r12 2782; CHECK-NEXT: adc.w r3, r3, lr 2783; CHECK-NEXT: adds.w r12, r2, r5 2784; CHECK-NEXT: adc.w lr, r3, r4 2785; CHECK-NEXT: vmov r5, r4, d13 2786; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2787; CHECK-NEXT: vmov.s8 r2, q1[10] 2788; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2789; CHECK-NEXT: vmov.s8 r3, q3[10] 2790; CHECK-NEXT: vmov.s8 r5, q1[11] 2791; CHECK-NEXT: vmov.s8 r4, q3[11] 2792; CHECK-NEXT: smull r5, r4, r4, r5 2793; CHECK-NEXT: vcmp.i32 ne, q0, zr 2794; CHECK-NEXT: smull r2, r3, r3, r2 2795; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2796; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2797; CHECK-NEXT: vpsel q0, q0, q4 2798; CHECK-NEXT: vmov r2, r3, d0 2799; CHECK-NEXT: vmov r5, r4, d1 2800; 
CHECK-NEXT: adds.w r2, r2, r12 2801; CHECK-NEXT: adc.w r3, r3, lr 2802; CHECK-NEXT: adds.w r12, r2, r5 2803; CHECK-NEXT: adc.w lr, r3, r4 2804; CHECK-NEXT: vmov.u16 r5, q5[6] 2805; CHECK-NEXT: vmov.u16 r4, q5[4] 2806; CHECK-NEXT: vmov.s8 r2, q1[12] 2807; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 2808; CHECK-NEXT: vmov.u16 r5, q5[7] 2809; CHECK-NEXT: vmov.u16 r4, q5[5] 2810; CHECK-NEXT: vmov.s8 r3, q3[12] 2811; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 2812; CHECK-NEXT: smull r2, r3, r3, r2 2813; CHECK-NEXT: vcmp.i32 ne, q0, zr 2814; CHECK-NEXT: vpsel q2, q2, q7 2815; CHECK-NEXT: vmov r5, r4, d4 2816; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2817; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2818; CHECK-NEXT: vmov.s8 r5, q1[13] 2819; CHECK-NEXT: vmov.s8 r4, q3[13] 2820; CHECK-NEXT: vcmp.i32 ne, q0, zr 2821; CHECK-NEXT: smull r5, r4, r4, r5 2822; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2823; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2824; CHECK-NEXT: vpsel q0, q0, q4 2825; CHECK-NEXT: vmov r2, r3, d0 2826; CHECK-NEXT: vmov r5, r4, d1 2827; CHECK-NEXT: adds.w r2, r2, r12 2828; CHECK-NEXT: adc.w r3, r3, lr 2829; CHECK-NEXT: adds.w r12, r2, r5 2830; CHECK-NEXT: adc.w lr, r3, r4 2831; CHECK-NEXT: vmov r5, r4, d5 2832; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 2833; CHECK-NEXT: vmov.s8 r2, q1[14] 2834; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 2835; CHECK-NEXT: vmov.s8 r3, q3[14] 2836; CHECK-NEXT: vmov.s8 r5, q1[15] 2837; CHECK-NEXT: vmov.s8 r4, q3[15] 2838; CHECK-NEXT: smull r5, r4, r4, r5 2839; CHECK-NEXT: vcmp.i32 ne, q0, zr 2840; CHECK-NEXT: smull r2, r3, r3, r2 2841; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 2842; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 2843; CHECK-NEXT: vpsel q0, q0, q4 2844; CHECK-NEXT: vmov r2, r3, d0 2845; CHECK-NEXT: vmov r5, r4, d1 2846; CHECK-NEXT: adds.w r2, r2, r12 2847; CHECK-NEXT: adc.w r3, r3, lr 2848; CHECK-NEXT: adds r2, r2, r5 2849; CHECK-NEXT: adcs r3, r4 2850; CHECK-NEXT: adds r0, r0, r2 2851; CHECK-NEXT: adcs r1, r3 2852; CHECK-NEXT: add sp, #16 2853; CHECK-NEXT: vpop 
{d8, d9, d10, d11, d12, d13, d14, d15} 2854; CHECK-NEXT: pop {r4, r5, r7, pc} 2855entry: 2856 %c = icmp eq <16 x i8> %b, zeroinitializer 2857 %xx = sext <16 x i8> %x to <16 x i64> 2858 %yy = sext <16 x i8> %y to <16 x i64> 2859 %m = mul <16 x i64> %xx, %yy 2860 %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer 2861 %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) 2862 %r = add i64 %z, %a 2863 ret i64 %r 2864} 2865 ; Test add_v2i8_v2i64_acc_zext: both <2 x i8> inputs are zero-extended to <2 x i64> and multiplied; product lanes are kept where %b == 0 and replaced by zero elsewhere, then add-reduced and added to the accumulator %a. The expected code forms the products with scalar umull and applies the mask with vpsel. 2866define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) { 2867; CHECK-LABEL: add_v2i8_v2i64_acc_zext: 2868; CHECK: @ %bb.0: @ %entry 2869; CHECK-NEXT: .save {r7, lr} 2870; CHECK-NEXT: push {r7, lr} 2871; CHECK-NEXT: vmov.i64 q3, #0xff 2872; CHECK-NEXT: vand q1, q1, q3 2873; CHECK-NEXT: vand q0, q0, q3 2874; CHECK-NEXT: vmov r2, s6 2875; CHECK-NEXT: vmov r3, s2 2876; CHECK-NEXT: umull lr, r12, r3, r2 2877; CHECK-NEXT: vmov r3, s4 2878; CHECK-NEXT: vmov r2, s0 2879; CHECK-NEXT: vand q1, q2, q3 2880; CHECK-NEXT: umull r2, r3, r2, r3 2881; CHECK-NEXT: vmov q0[2], q0[0], r2, lr 2882; CHECK-NEXT: vmov r2, s4 2883; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 2884; CHECK-NEXT: movs r3, #0 2885; CHECK-NEXT: cmp r2, #0 2886; CHECK-NEXT: csetm r2, eq 2887; CHECK-NEXT: bfi r3, r2, #0, #8 2888; CHECK-NEXT: vmov r2, s6 2889; CHECK-NEXT: vmov.i32 q1, #0x0 2890; CHECK-NEXT: cmp r2, #0 2891; CHECK-NEXT: csetm r2, eq 2892; CHECK-NEXT: bfi r3, r2, #8, #8 2893; CHECK-NEXT: vmsr p0, r3 2894; CHECK-NEXT: vpsel q0, q0, q1 2895; CHECK-NEXT: vmov lr, r12, d1 2896; CHECK-NEXT: vmov r3, r2, d0 2897; CHECK-NEXT: adds.w r3, r3, lr 2898; CHECK-NEXT: adc.w r2, r2, r12 2899; CHECK-NEXT: adds r0, r0, r3 2900; CHECK-NEXT: adcs r1, r2 2901; CHECK-NEXT: pop {r7, pc} 2902entry: 2903 %c = icmp eq <2 x i8> %b, zeroinitializer 2904 %xx = zext <2 x i8> %x to <2 x i64> 2905 %yy = zext <2 x i8> %y to <2 x i64> 2906 %m = mul <2 x i64> %xx, %yy 2907 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 2908 %z = call 
i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 2909 %r = add i64 %z, %a 2910 ret i64 %r 2911} 2912 ; Test add_v2i8_v2i64_acc_sext: both <2 x i8> inputs are sign-extended to <2 x i64> and multiplied; product lanes are kept where %b == 0 and replaced by zero elsewhere, then add-reduced and added to the accumulator %a. The expected code sign-extends with sxtb, forms the products with scalar smull and applies the mask with vpsel. 2913define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) { 2914; CHECK-LABEL: add_v2i8_v2i64_acc_sext: 2915; CHECK: @ %bb.0: @ %entry 2916; CHECK-NEXT: .save {r7, lr} 2917; CHECK-NEXT: push {r7, lr} 2918; CHECK-NEXT: vmov.i32 q3, #0xff 2919; CHECK-NEXT: movs r3, #0 2920; CHECK-NEXT: vand q2, q2, q3 2921; CHECK-NEXT: vmov r2, s8 2922; CHECK-NEXT: cmp r2, #0 2923; CHECK-NEXT: csetm r2, eq 2924; CHECK-NEXT: bfi r3, r2, #0, #8 2925; CHECK-NEXT: vmov r2, s10 2926; CHECK-NEXT: cmp r2, #0 2927; CHECK-NEXT: csetm r2, eq 2928; CHECK-NEXT: bfi r3, r2, #8, #8 2929; CHECK-NEXT: vmov r2, s6 2930; CHECK-NEXT: vmsr p0, r3 2931; CHECK-NEXT: vmov r3, s2 2932; CHECK-NEXT: sxtb r2, r2 2933; CHECK-NEXT: sxtb r3, r3 2934; CHECK-NEXT: smull lr, r12, r3, r2 2935; CHECK-NEXT: vmov r3, s4 2936; CHECK-NEXT: vmov r2, s0 2937; CHECK-NEXT: vmov.i32 q1, #0x0 2938; CHECK-NEXT: sxtb r3, r3 2939; CHECK-NEXT: sxtb r2, r2 2940; CHECK-NEXT: smull r2, r3, r2, r3 2941; CHECK-NEXT: vmov q0[2], q0[0], r2, lr 2942; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 2943; CHECK-NEXT: vpsel q0, q0, q1 2944; CHECK-NEXT: vmov lr, r12, d1 2945; CHECK-NEXT: vmov r3, r2, d0 2946; CHECK-NEXT: adds.w r3, r3, lr 2947; CHECK-NEXT: adc.w r2, r2, r12 2948; CHECK-NEXT: adds r0, r0, r3 2949; CHECK-NEXT: adcs r1, r2 2950; CHECK-NEXT: pop {r7, pc} 2951entry: 2952 %c = icmp eq <2 x i8> %b, zeroinitializer 2953 %xx = sext <2 x i8> %x to <2 x i64> 2954 %yy = sext <2 x i8> %y to <2 x i64> 2955 %m = mul <2 x i64> %xx, %yy 2956 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 2957 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 2958 %r = add i64 %z, %a 2959 ret i64 %r 2960} 2961 ; Test add_v2i64_v2i64_acc: the <2 x i64> inputs are multiplied directly (no extension); product lanes are kept where %b == 0 and replaced by zero elsewhere, then add-reduced and added to the accumulator %a. The expected code builds each 64-bit product from umull plus mla, tests each 64-bit lane of %b with orrs, and applies the mask with vpsel. 2962define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) { 2963; CHECK-LABEL: add_v2i64_v2i64_acc: 2964; CHECK: @ %bb.0: @ %entry 2965; CHECK-NEXT: 
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 2966; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 2967; CHECK-NEXT: vmov r2, r12, d3 2968; CHECK-NEXT: vmov r3, lr, d1 2969; CHECK-NEXT: vmov r6, r9, d2 2970; CHECK-NEXT: vmov.i32 q1, #0x0 2971; CHECK-NEXT: vmov r5, r11, d0 2972; CHECK-NEXT: umull r10, r8, r3, r2 2973; CHECK-NEXT: umull r4, r7, r5, r6 2974; CHECK-NEXT: mla r3, r3, r12, r8 2975; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 2976; CHECK-NEXT: mla r2, lr, r2, r3 2977; CHECK-NEXT: mla r3, r5, r9, r7 2978; CHECK-NEXT: mla r3, r11, r6, r3 2979; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 2980; CHECK-NEXT: vmov r2, r3, d4 2981; CHECK-NEXT: orrs r2, r3 2982; CHECK-NEXT: mov.w r3, #0 2983; CHECK-NEXT: csetm r2, eq 2984; CHECK-NEXT: bfi r3, r2, #0, #8 2985; CHECK-NEXT: vmov r2, r7, d5 2986; CHECK-NEXT: orrs r2, r7 2987; CHECK-NEXT: csetm r2, eq 2988; CHECK-NEXT: bfi r3, r2, #8, #8 2989; CHECK-NEXT: vmsr p0, r3 2990; CHECK-NEXT: vpsel q0, q0, q1 2991; CHECK-NEXT: vmov r2, r3, d1 2992; CHECK-NEXT: vmov r7, r6, d0 2993; CHECK-NEXT: adds r2, r2, r7 2994; CHECK-NEXT: adcs r3, r6 2995; CHECK-NEXT: adds r0, r0, r2 2996; CHECK-NEXT: adcs r1, r3 2997; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 2998entry: 2999 %c = icmp eq <2 x i64> %b, zeroinitializer 3000 %m = mul <2 x i64> %x, %y 3001 %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer 3002 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) 3003 %r = add i64 %z, %a 3004 ret i64 %r 3005} 3006 ; Declarations of the llvm.vector.reduce.add intrinsic overloads (presumably one per reduction type exercised by the tests in this file - confirm against the full file). 3007declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) 3008declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) 3009declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) 3010declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) 3011declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) 3012declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) 3013declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) 3014declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) 3015declare i64 
@llvm.vector.reduce.add.v8i64(<8 x i64>) 3016declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) 3017