; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
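; The source loops are of roughly this C shape (an illustrative sketch, not
; the exact godbolt input; names mirror the test functions below). Each
; addvNi32iM sums N elements loaded from M-bit memory:
;
;   int addv4i32i32(int *x) {
;     int s = 0;
;     for (int i = 0; i < 4; i++)
;       s += x[i];
;     return s;
;   }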

define i32 @addv2i32i32(ptr %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %x, align 4
  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
  %1 = load i32, ptr %arrayidx.1, align 4
  %add.1 = add nsw i32 %1, %0
  ret i32 %add.1
}

define i32 @addv4i32i32(ptr %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  ret i32 %1
}

define i32 @addv8i32i32(ptr %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  ret i32 %1
}

define i32 @addv16i32i32(ptr %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  ret i32 %1
}

define i32 @addv24i32i32(ptr %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
  %1 = load <16 x i32>, ptr %arrayidx.8, align 4
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %op.rdx = add nsw i32 %2, %3
  ret i32 %op.rdx
}

define i32 @addv32i32i32(ptr %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u32 r0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    vldrw.u32 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0)
  ret i32 %1
}

define i32 @addv64i32i32(ptr %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i32>, ptr %x, align 4
  %1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
  ret i32 %1
}

define i32 @addv128i32i32(ptr %x) {
; CHECK-LABEL: addv128i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #256]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #272]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #288]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #304]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #320]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #336]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #352]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #368]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #384]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #400]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #416]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #432]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #448]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #464]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #480]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #496]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %1 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
  %3 = add i32 %2, %0
  %4 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %4, align 4
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
  %6 = add i32 %5, %3
  %7 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %7, align 4
  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
  %9 = add i32 %8, %6
  %10 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %10, align 4
  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
  %12 = add i32 %11, %9
  %13 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %13, align 4
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
  %15 = add i32 %14, %12
  %16 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %16, align 4
  %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
  %18 = add i32 %17, %15
  %19 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %19, align 4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
  %21 = add i32 %20, %18
  %22 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %22, align 4
  %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
  %24 = add i32 %23, %21
  %25 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %25, align 4
  %26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
  %27 = add i32 %26, %24
  %28 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %28, align 4
  %29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
  %30 = add i32 %29, %27
  %31 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %31, align 4
  %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
  %33 = add i32 %32, %30
  %34 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %34, align 4
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
  %36 = add i32 %35, %33
  %37 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %37, align 4
  %38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
  %39 = add i32 %38, %36
  %40 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %40, align 4
  %41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
  %42 = add i32 %41, %39
  %43 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %43, align 4
  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
  %45 = add i32 %44, %42
  %46 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %46, align 4
  %47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
  %48 = add i32 %47, %45
  %49 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %49, align 4
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
  %51 = add i32 %50, %48
  %52 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %52, align 4
  %53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
  %54 = add i32 %53, %51
  %55 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %55, align 4
  %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
  %57 = add i32 %56, %54
  %58 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %58, align 4
  %59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
  %60 = add i32 %59, %57
  %61 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %61, align 4
  %62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
  %63 = add i32 %62, %60
  %64 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %64, align 4
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
  %66 = add i32 %65, %63
  %67 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %67, align 4
  %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
  %69 = add i32 %68, %66
  %70 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %70, align 4
  %71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
  %72 = add i32 %71, %69
  %73 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %73, align 4
  %74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
  %75 = add i32 %74, %72
  %76 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %76, align 4
  %77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
  %78 = add i32 %77, %75
  %79 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %79, align 4
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
  %81 = add i32 %80, %78
  %82 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %82, align 4
  %83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
  %84 = add i32 %83, %81
  %85 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %85, align 4
  %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
  %87 = add i32 %86, %84
  %88 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %88, align 4
  %89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
  %90 = add i32 %89, %87
  %91 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %91, align 4
  %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
  %93 = add i32 %92, %90
  ret i32 %93
}

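; The same add reductions with the inputs sign extended from i16 to i32. An
; 8-wide sext reduction can be done directly on the v8i16 vector with
; vaddv.s16 / vaddva.s16; the other widths are typically split into v4i32
; chunks using the widening vldrh.s32 loads.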
define i32 @addv2i32i16(ptr %x) {
; CHECK-LABEL: addv2i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrsh.w r1, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %1 to i32
  %add.1 = add nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i16(ptr %x) {
; CHECK-LABEL: addv4i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i16(ptr %x) {
; CHECK-LABEL: addv8i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i16(ptr %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i16(ptr %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16
  %2 = load <8 x i16>, ptr %arrayidx.16, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i16(ptr %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

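; In the 64-element case below the SLP output mixes vector reductions of
; several widths (32, 16, 8 and 4 elements) plus a scalar tail, so alongside
; the vaddv/vaddva chain the last four elements are summed with scalar
; ldrsh loads and adds.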
define i32 @addv64i32i16(ptr %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #88]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.s32 q0, [r0, #112]
; CHECK-NEXT:    ldrsh.w r0, [r0, #126]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32
  %2 = load <16 x i16>, ptr %arrayidx.32, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48
  %4 = load <8 x i16>, ptr %arrayidx.48, align 2
  %5 = sext <8 x i16> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56
  %6 = load <4 x i16>, ptr %arrayidx.56, align 2
  %7 = sext <4 x i16> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60
  %8 = load i16, ptr %arrayidx.60, align 2
  %conv.60 = sext i16 %8 to i32
  %arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61
  %9 = load i16, ptr %arrayidx.61, align 2
  %conv.61 = sext i16 %9 to i32
  %arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62
  %10 = load i16, ptr %arrayidx.62, align 2
  %conv.62 = sext i16 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nsw i32 %op.rdx8, %14
  %15 = add nsw i32 %op.rdx9, %conv.60
  %16 = add nsw i32 %15, %conv.61
  %17 = add nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63
  %18 = load i16, ptr %arrayidx.63, align 2
  %conv.63 = sext i16 %18 to i32
  %add.63 = add nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i16(ptr %x) {
; CHECK-LABEL: addv128i32i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.s16 r2, q1
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %10, align 2
  %11 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %14, align 2
  %15 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %22, align 2
  %23 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %26, align 2
  %27 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27)
  %29 = add i32 %28, %25
  %30 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %30, align 2
  %31 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31)
  %33 = add i32 %32, %29
  %34 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
  %37 = add i32 %36, %33
  %38 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %38, align 2
  %39 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
  %41 = add i32 %40, %37
  %42 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %42, align 2
  %43 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %41
  %46 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
  %49 = add i32 %48, %45
  %50 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %50, align 2
  %51 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51)
  %53 = add i32 %52, %49
  %54 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %54, align 2
  %55 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
  %57 = add i32 %56, %53
  %58 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %58, align 2
  %59 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59)
  %61 = add i32 %60, %57
  ret i32 %61
}

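; Add reductions with the inputs zero extended from i8 to i32. The 4- and
; 8-wide chunks use the widening vldrb.u32/vldrb.u16 loads; a full 16-lane
; v16i8 vector can be reduced in one go with vaddv.u8, since VADDV always
; accumulates into a 32-bit result regardless of the element size.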
define i32 @addv2i32i8(ptr %x) {
; CHECK-LABEL: addv2i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %1 to i32
  %add.1 = add nuw nsw i32 %conv, %conv.1
  ret i32 %add.1
}

define i32 @addv4i32i8(ptr %x) {
; CHECK-LABEL: addv4i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

define i32 @addv8i32i8(ptr %x) {
; CHECK-LABEL: addv8i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

define i32 @addv16i32i8(ptr %x) {
; CHECK-LABEL: addv16i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

define i32 @addv24i32i8(ptr %x) {
; CHECK-LABEL: addv24i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16
  %2 = load <8 x i8>, ptr %arrayidx.16, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %op.rdx = add nuw nsw i32 %4, %5
  ret i32 %op.rdx
}

define i32 @addv32i32i8(ptr %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  ret i32 %2
}

define i32 @addv64i32i8(ptr %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0]
; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
; CHECK-NEXT:    ldrb.w r1, [r0, #60]
; CHECK-NEXT:    vaddv.u32 r2, q1
; CHECK-NEXT:    ldrb.w r3, [r0, #61]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
; CHECK-NEXT:    ldrb.w r12, [r0, #62]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrb.u32 q0, [r0, #56]
; CHECK-NEXT:    ldrb.w r0, [r0, #63]
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    add r1, r2
; CHECK-NEXT:    add r1, r3
; CHECK-NEXT:    add r1, r12
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32
  %2 = load <16 x i8>, ptr %arrayidx.32, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48
  %4 = load <8 x i8>, ptr %arrayidx.48, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56
  %6 = load <4 x i8>, ptr %arrayidx.56, align 1
  %7 = zext <4 x i8> %6 to <4 x i32>
  %arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60
  %8 = load i8, ptr %arrayidx.60, align 1
  %conv.60 = zext i8 %8 to i32
  %arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61
  %9 = load i8, ptr %arrayidx.61, align 1
  %conv.61 = zext i8 %9 to i32
  %arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62
  %10 = load i8, ptr %arrayidx.62, align 1
  %conv.62 = zext i8 %10 to i32
  %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %op.rdx = add nuw nsw i32 %11, %12
  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
  %op.rdx8 = add nuw nsw i32 %op.rdx, %13
  %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
  %op.rdx9 = add nuw nsw i32 %op.rdx8, %14
  %15 = add nuw nsw i32 %op.rdx9, %conv.60
  %16 = add nuw nsw i32 %15, %conv.61
  %17 = add nuw nsw i32 %16, %conv.62
  %arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63
  %18 = load i8, ptr %arrayidx.63, align 1
  %conv.63 = zext i8 %18 to i32
  %add.63 = add nuw nsw i32 %17, %conv.63
  ret i32 %add.63
}

define i32 @addv128i32i8(ptr %x) {
; CHECK-LABEL: addv128i32i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #32]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #48]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #64]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #80]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #96]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    vldrb.u8 q0, [r1, #112]
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = add i32 %4, %1
  %6 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %9 = add i32 %8, %5
  %10 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %10, align 1
  %11 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11)
  %13 = add i32 %12, %9
  %14 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %14, align 1
  %15 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %13
  %18 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %21 = add i32 %20, %17
  %22 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %22, align 1
  %23 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23)
  %25 = add i32 %24, %21
  %26 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %26, align 1
  %27 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27)
  %29 = add i32 %28, %25
  ret i32 %29
}

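; Reductions where the accumulator type itself is i16: only the low 16 bits
; of the sum matter, so the vector work can use the plain unsigned vaddv
; forms and the result is sign extended with sxth at the end for the
; signext return.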
define signext i16 @addv2i16i16(ptr %x) {
; CHECK-LABEL: addv2i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrh r1, [r0]
; CHECK-NEXT:    ldrh r0, [r0, #2]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %1 = load i16, ptr %arrayidx.1, align 2
  %add.1 = add i16 %1, %0
  ret i16 %add.1
}

define signext i16 @addv4i16i16(ptr %x) {
; CHECK-LABEL: addv4i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0)
  ret i16 %1
}

define signext i16 @addv8i16i16(ptr %x) {
; CHECK-LABEL: addv8i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  ret i16 %1
}

define signext i16 @addv16i16i16(ptr %x) {
; CHECK-LABEL: addv16i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0)
  ret i16 %1
}

define signext i16 @addv24i16i16(ptr %x) {
; CHECK-LABEL: addv24i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %1 = load <16 x i16>, ptr %arrayidx.8, align 2
  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %op.rdx = add i16 %2, %3
  ret i16 %op.rdx
}

define signext i16 @addv32i16i16(ptr %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0)
  ret i16 %1
}

define signext i16 @addv64i16i16(ptr %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i16>, ptr %x, align 2
  %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0)
  ret i16 %1
}

define signext i16 @addv128i16i16(ptr %x) {
; CHECK-LABEL: addv128i16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0]
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u16 r2, q1
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
  %1 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %1, align 2
  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
  %3 = add i16 %2, %0
  %4 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %4, align 2
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
  %6 = add i16 %5, %3
  %7 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %7, align 2
  %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
  %9 = add i16 %8, %6
  %10 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %10, align 2
  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
  %12 = add i16 %11, %9
  %13 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %13, align 2
  %14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
  %15 = add i16 %14, %12
  %16 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %16, align 2
  %17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
  %18 = add i16 %17, %15
  %19 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %19, align 2
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
  %21 = add i16 %20, %18
  %22 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %22, align 2
  %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
  %24 = add i16 %23, %21
  %25 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %25, align 2
  %26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
  %27 = add i16 %26, %24
  %28 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %28, align 2
  %29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
  %30 = add i16 %29, %27
  %31 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %31, align 2
  %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
  %33 = add i16 %32, %30
  %34 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %34, align 2
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
  %36 = add i16 %35, %33
  %37 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %37, align 2
  %38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
  %39 = add i16 %38, %36
  %40 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %40, align 2
  %41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
  %42 = add i16 %41, %39
  %43 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %43, align 2
  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
  %45 = add i16 %44, %42
  ret i16 %45
}

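; The i8-accumulator equivalents: the sum is only needed modulo 256, so any
; vaddv form works and the result is zero extended with uxtb for the
; zeroext return.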
define zeroext i8 @addv2i8i8(ptr %x) {
; CHECK-LABEL: addv2i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r1, [r0]
; CHECK-NEXT:    ldrb r0, [r0, #1]
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %add.1 = add i8 %1, %0
  ret i8 %add.1
}

define zeroext i8 @addv4i8i8(ptr %x) {
; CHECK-LABEL: addv4i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv8i8i8(ptr %x) {
; CHECK-LABEL: addv8i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv16i8i8(ptr %x) {
; CHECK-LABEL: addv16i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r0]
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv24i8i8(ptr %x) {
; CHECK-LABEL: addv24i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
; CHECK-NEXT:    vaddv.u16 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %1 = load <16 x i8>, ptr %arrayidx.8, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
  %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
  %op.rdx = add i8 %2, %3
  ret i8 %op.rdx
}

define zeroext i8 @addv32i8i8(ptr %x) {
; CHECK-LABEL: addv32i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv64i8i8(ptr %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <64 x i8>, ptr %x, align 1
  %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0)
  ret i8 %1
}

define zeroext i8 @addv128i8i8(ptr %x) {
; CHECK-LABEL: addv128i8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0]
; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
; CHECK-NEXT:    vaddv.u8 r2, q1
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
  %1 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %1, align 1
  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
  %3 = add i8 %2, %0
  %4 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %4, align 1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
  %6 = add i8 %5, %3
  %7 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %7, align 1
  %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
  %9 = add i8 %8, %6
  %10 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %10, align 1
  %11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
  %12 = add i8 %11, %9
  %13 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %13, align 1
  %14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
  %15 = add i8 %14, %12
  %16 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %16, align 1
  %17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
  %18 = add i8 %17, %15
  %19 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %19, align 1
  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
  %21 = add i8 %20, %18
  ret i8 %21
}

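; Multiply-accumulate (dot product) reductions, from unrolled loops of
; roughly this shape (again an illustrative sketch, not the exact godbolt
; source):
;
;   int mlav4i32i32(int *x, int *y) {
;     int s = 0;
;     for (int i = 0; i < 4; i++)
;       s += x[i] * y[i];
;     return s;
;   }
;
; These select to vmlav.u32, with vmlava.u32 accumulating the later chunks.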
define i32 @mlav2i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r2, [r0]
; CHECK-NEXT:    ldrd r1, r3, [r1]
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    mla r0, r3, r2, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %x, align 4
  %1 = load i32, ptr %y, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
  %2 = load i32, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1
  %3 = load i32, ptr %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %3, %2
  %add.1 = add nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %x, align 4
  %1 = load <4 x i32>, ptr %y, align 4
  %2 = mul nsw <4 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  ret i32 %3
}

define i32 @mlav8i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = load <8 x i32>, ptr %y, align 4
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  ret i32 %3
}

define i32 @mlav16i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmlav.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r2, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i32>, ptr %x, align 4
  %1 = load <16 x i32>, ptr %y, align 4
  %2 = mul nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  ret i32 %3
}

define i32 @mlav24i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i32>, ptr %x, align 4
  %1 = load <8 x i32>, ptr %y, align 4
  %2 = mul nsw <8 x i32> %1, %0
  %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8
  %3 = load <16 x i32>, ptr %arrayidx.8, align 4
  %4 = load <16 x i32>, ptr %arrayidx1.8, align 4
  %5 = mul nsw <16 x i32> %4, %3
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %op.rdx = add nsw i32 %6, %7
  ret i32 %op.rdx
}

define i32 @mlav32i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = load <32 x i32>, ptr %x, align 4
  %1 = load <32 x i32>, ptr %y, align 4
  %2 = mul nsw <32 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
  ret i32 %3
}

define i32 @mlav64i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    vmlav.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
; CHECK-NEXT:    vmlava.u32 r0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
1574 %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8 1575 %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39) 1576 %41 = add i32 %40, %36 1577 %42 = getelementptr inbounds i32, ptr %x, i32 36 1578 %wide.load.9 = load <4 x i32>, ptr %42, align 4 1579 %43 = getelementptr inbounds i32, ptr %y, i32 36 1580 %wide.load10.9 = load <4 x i32>, ptr %43, align 4 1581 %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9 1582 %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44) 1583 %46 = add i32 %45, %41 1584 %47 = getelementptr inbounds i32, ptr %x, i32 40 1585 %wide.load.10 = load <4 x i32>, ptr %47, align 4 1586 %48 = getelementptr inbounds i32, ptr %y, i32 40 1587 %wide.load10.10 = load <4 x i32>, ptr %48, align 4 1588 %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10 1589 %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49) 1590 %51 = add i32 %50, %46 1591 %52 = getelementptr inbounds i32, ptr %x, i32 44 1592 %wide.load.11 = load <4 x i32>, ptr %52, align 4 1593 %53 = getelementptr inbounds i32, ptr %y, i32 44 1594 %wide.load10.11 = load <4 x i32>, ptr %53, align 4 1595 %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11 1596 %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54) 1597 %56 = add i32 %55, %51 1598 %57 = getelementptr inbounds i32, ptr %x, i32 48 1599 %wide.load.12 = load <4 x i32>, ptr %57, align 4 1600 %58 = getelementptr inbounds i32, ptr %y, i32 48 1601 %wide.load10.12 = load <4 x i32>, ptr %58, align 4 1602 %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12 1603 %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59) 1604 %61 = add i32 %60, %56 1605 %62 = getelementptr inbounds i32, ptr %x, i32 52 1606 %wide.load.13 = load <4 x i32>, ptr %62, align 4 1607 %63 = getelementptr inbounds i32, ptr %y, i32 52 1608 %wide.load10.13 = load <4 x i32>, ptr %63, align 4 1609 %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13 1610 %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64) 1611 %66 = add i32 %65, %61 1612 %67 = getelementptr inbounds i32, ptr %x, i32 56 1613 %wide.load.14 = load <4 x i32>, ptr %67, align 4 1614 %68 = getelementptr inbounds i32, ptr %y, i32 56 1615 %wide.load10.14 = load <4 x i32>, ptr %68, align 4 1616 %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14 1617 %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69) 1618 %71 = add i32 %70, %66 1619 %72 = getelementptr inbounds i32, ptr %x, i32 60 1620 %wide.load.15 = load <4 x i32>, ptr %72, align 4 1621 %73 = getelementptr inbounds i32, ptr %y, i32 60 1622 %wide.load10.15 = load <4 x i32>, ptr %73, align 4 1623 %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15 1624 %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74) 1625 %76 = add i32 %75, %71 1626 ret i32 %76 1627} 1628 1629define i32 @mlav128i32i32(ptr %x, ptr %y) { 1630; CHECK-LABEL: mlav128i32i32: 1631; CHECK: @ %bb.0: @ %entry 1632; CHECK-NEXT: vldrw.u32 q0, [r0] 1633; CHECK-NEXT: vldrw.u32 q1, [r1] 1634; CHECK-NEXT: mov r2, r0 1635; CHECK-NEXT: vmlav.u32 r0, q1, q0 1636; CHECK-NEXT: vldrw.u32 q0, [r2, #16] 1637; CHECK-NEXT: vldrw.u32 q1, [r1, #16] 1638; CHECK-NEXT: vmlava.u32 r0, q1, q0 1639; CHECK-NEXT: vldrw.u32 q0, [r2, #32] 1640; CHECK-NEXT: vldrw.u32 q1, [r1, #32] 1641; CHECK-NEXT: vmlava.u32 r0, q1, q0 1642; CHECK-NEXT: vldrw.u32 q0, [r2, #48] 1643; CHECK-NEXT: vldrw.u32 q1, [r1, #48] 1644; CHECK-NEXT: vmlava.u32 r0, q1, q0 1645; CHECK-NEXT: vldrw.u32 q0, [r2, #64] 1646; CHECK-NEXT: vldrw.u32 q1, [r1, #64] 1647; CHECK-NEXT: vmlava.u32 r0, q1, q0 1648; CHECK-NEXT: vldrw.u32 q0, 
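; The 128 x i32 case below needs vldrw offsets up to #496, which still fits the
; instruction's immediate range, so the whole reduction stays one vmlav/vmlava
; chain (r0 is repurposed as the accumulator, hence the mov to r2).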
define i32 @mlav128i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #256]
; CHECK-NEXT: vldrw.u32 q1, [r1, #256]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #272]
; CHECK-NEXT: vldrw.u32 q1, [r1, #272]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #288]
; CHECK-NEXT: vldrw.u32 q1, [r1, #288]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #304]
; CHECK-NEXT: vldrw.u32 q1, [r1, #304]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #320]
; CHECK-NEXT: vldrw.u32 q1, [r1, #320]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #336]
; CHECK-NEXT: vldrw.u32 q1, [r1, #336]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #352]
; CHECK-NEXT: vldrw.u32 q1, [r1, #352]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #368]
; CHECK-NEXT: vldrw.u32 q1, [r1, #368]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #384]
; CHECK-NEXT: vldrw.u32 q1, [r1, #384]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #400]
; CHECK-NEXT: vldrw.u32 q1, [r1, #400]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #416]
; CHECK-NEXT: vldrw.u32 q1, [r1, #416]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #432]
; CHECK-NEXT: vldrw.u32 q1, [r1, #432]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #448]
; CHECK-NEXT: vldrw.u32 q1, [r1, #448]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #464]
; CHECK-NEXT: vldrw.u32 q1, [r1, #464]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #480]
; CHECK-NEXT: vldrw.u32 q1, [r1, #480]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #496]
; CHECK-NEXT: vldrw.u32 q1, [r1, #496]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  %77 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %77, align 4
  %78 = getelementptr inbounds i32, ptr %y, i32 64
  %wide.load10.16 = load <4 x i32>, ptr %78, align 4
  %79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79)
  %81 = add i32 %80, %76
  %82 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %82, align 4
  %83 = getelementptr inbounds i32, ptr %y, i32 68
  %wide.load10.17 = load <4 x i32>, ptr %83, align 4
  %84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
  %85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84)
  %86 = add i32 %85, %81
  %87 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %87, align 4
  %88 = getelementptr inbounds i32, ptr %y, i32 72
  %wide.load10.18 = load <4 x i32>, ptr %88, align 4
  %89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
  %90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89)
  %91 = add i32 %90, %86
  %92 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %92, align 4
  %93 = getelementptr inbounds i32, ptr %y, i32 76
  %wide.load10.19 = load <4 x i32>, ptr %93, align 4
  %94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
  %95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94)
  %96 = add i32 %95, %91
  %97 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %97, align 4
  %98 = getelementptr inbounds i32, ptr %y, i32 80
  %wide.load10.20 = load <4 x i32>, ptr %98, align 4
  %99 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
  %101 = add i32 %100, %96
  %102 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %102, align 4
  %103 = getelementptr inbounds i32, ptr %y, i32 84
  %wide.load10.21 = load <4 x i32>, ptr %103, align 4
  %104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
  %105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104)
  %106 = add i32 %105, %101
  %107 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %107, align 4
  %108 = getelementptr inbounds i32, ptr %y, i32 88
  %wide.load10.22 = load <4 x i32>, ptr %108, align 4
  %109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
  %110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109)
  %111 = add i32 %110, %106
  %112 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %112, align 4
  %113 = getelementptr inbounds i32, ptr %y, i32 92
  %wide.load10.23 = load <4 x i32>, ptr %113, align 4
  %114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
  %115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114)
  %116 = add i32 %115, %111
  %117 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %117, align 4
  %118 = getelementptr inbounds i32, ptr %y, i32 96
  %wide.load10.24 = load <4 x i32>, ptr %118, align 4
  %119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
  %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119)
  %121 = add i32 %120, %116
  %122 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %122, align 4
  %123 = getelementptr inbounds i32, ptr %y, i32 100
  %wide.load10.25 = load <4 x i32>, ptr %123, align 4
  %124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
  %125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124)
  %126 = add i32 %125, %121
  %127 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %127, align 4
  %128 = getelementptr inbounds i32, ptr %y, i32 104
  %wide.load10.26 = load <4 x i32>, ptr %128, align 4
  %129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
  %130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129)
  %131 = add i32 %130, %126
  %132 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %132, align 4
  %133 = getelementptr inbounds i32, ptr %y, i32 108
  %wide.load10.27 = load <4 x i32>, ptr %133, align 4
  %134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
  %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
  %136 = add i32 %135, %131
  %137 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %137, align 4
  %138 = getelementptr inbounds i32, ptr %y, i32 112
  %wide.load10.28 = load <4 x i32>, ptr %138, align 4
  %139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
  %140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139)
  %141 = add i32 %140, %136
  %142 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %142, align 4
  %143 = getelementptr inbounds i32, ptr %y, i32 116
  %wide.load10.29 = load <4 x i32>, ptr %143, align 4
  %144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
  %145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144)
  %146 = add i32 %145, %141
  %147 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %147, align 4
  %148 = getelementptr inbounds i32, ptr %y, i32 120
  %wide.load10.30 = load <4 x i32>, ptr %148, align 4
  %149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
  %150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149)
  %151 = add i32 %150, %146
  %152 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %152, align 4
  %153 = getelementptr inbounds i32, ptr %y, i32 124
  %wide.load10.31 = load <4 x i32>, ptr %153, align 4
  %154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
  %155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154)
  %156 = add i32 %155, %151
  ret i32 %156
}

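; The cases below multiply i16 values sign-extended to i32. The 2-element
; version stays scalar, lowering to ldrsh loads with a mul and smlabb.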
define i32 @mlav2i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r2, [r0]
; CHECK-NEXT: ldrsh.w r3, [r1]
; CHECK-NEXT: ldrsh.w r0, [r0, #2]
; CHECK-NEXT: ldrsh.w r1, [r1, #2]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %1 = load i16, ptr %y, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %2 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
  %3 = load i16, ptr %arrayidx1.1, align 2
  %conv2.1 = sext i16 %3 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %add.1 = add nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = load <4 x i16>, ptr %y, align 2
  %3 = sext <4 x i16> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

define i32 @mlav8i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

define i32 @mlav16i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = load <16 x i16>, ptr %y, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

define i32 @mlav24i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
  %5 = load <16 x i16>, ptr %arrayidx.8, align 2
  %6 = sext <16 x i16> %5 to <16 x i32>
  %7 = load <16 x i16>, ptr %arrayidx1.8, align 2
  %8 = sext <16 x i16> %7 to <16 x i32>
  %9 = mul nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nsw i32 %10, %11
  ret i32 %op.rdx
}

define i32 @mlav32i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #48]
; CHECK-NEXT: vldrh.s32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #56]
; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = load <32 x i16>, ptr %y, align 2
  %3 = sext <32 x i16> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

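; From 64 elements up the unrolled chunks are 8 wide, so the sign extension
; folds into vmlav.s16/vmlava.s16 on full 8 x i16 registers instead of the
; widening vldrh.s32 loads used for the 4-wide chunks above.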
define i32 @mlav64i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

define i32 @mlav128i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #128]
; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #144]
; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #160]
; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #176]
; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #192]
; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #208]
; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #224]
; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #240]
; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  %53 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %53, align 2
  %54 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %55 = getelementptr inbounds i16, ptr %y, i32 64
  %wide.load11.8 = load <8 x i16>, ptr %55, align 2
  %56 = sext <8 x i16> %wide.load11.8 to <8 x i32>
  %57 = mul nsw <8 x i32> %56, %54
  %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
  %59 = add i32 %58, %52
  %60 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %60, align 2
  %61 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %62 = getelementptr inbounds i16, ptr %y, i32 72
  %wide.load11.9 = load <8 x i16>, ptr %62, align 2
  %63 = sext <8 x i16> %wide.load11.9 to <8 x i32>
  %64 = mul nsw <8 x i32> %63, %61
  %65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64)
  %66 = add i32 %65, %59
  %67 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %67, align 2
  %68 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %69 = getelementptr inbounds i16, ptr %y, i32 80
  %wide.load11.10 = load <8 x i16>, ptr %69, align 2
  %70 = sext <8 x i16> %wide.load11.10 to <8 x i32>
  %71 = mul nsw <8 x i32> %70, %68
  %72 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %71)
  %73 = add i32 %72, %66
  %74 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %74, align 2
  %75 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %76 = getelementptr inbounds i16, ptr %y, i32 88
  %wide.load11.11 = load <8 x i16>, ptr %76, align 2
  %77 = sext <8 x i16> %wide.load11.11 to <8 x i32>
  %78 = mul nsw <8 x i32> %77, %75
  %79 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %78)
  %80 = add i32 %79, %73
  %81 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %81, align 2
  %82 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %83 = getelementptr inbounds i16, ptr %y, i32 96
  %wide.load11.12 = load <8 x i16>, ptr %83, align 2
  %84 = sext <8 x i16> %wide.load11.12 to <8 x i32>
  %85 = mul nsw <8 x i32> %84, %82
  %86 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %85)
  %87 = add i32 %86, %80
  %88 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %88, align 2
  %89 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %90 = getelementptr inbounds i16, ptr %y, i32 104
  %wide.load11.13 = load <8 x i16>, ptr %90, align 2
  %91 = sext <8 x i16> %wide.load11.13 to <8 x i32>
  %92 = mul nsw <8 x i32> %91, %89
  %93 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %92)
  %94 = add i32 %93, %87
  %95 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %95, align 2
  %96 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %97 = getelementptr inbounds i16, ptr %y, i32 112
  %wide.load11.14 = load <8 x i16>, ptr %97, align 2
  %98 = sext <8 x i16> %wide.load11.14 to <8 x i32>
  %99 = mul nsw <8 x i32> %98, %96
  %100 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %99)
  %101 = add i32 %100, %94
  %102 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %102, align 2
  %103 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %104 = getelementptr inbounds i16, ptr %y, i32 120
  %wide.load11.15 = load <8 x i16>, ptr %104, align 2
  %105 = sext <8 x i16> %wide.load11.15 to <8 x i32>
  %106 = mul nsw <8 x i32> %105, %103
  %107 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %106)
  %108 = add i32 %107, %101
  ret i32 %108
}

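; The same ladder with i8 operands zero-extended to i32; the 2-element case
; again stays scalar, with the zero-extends folded into the ldrb loads.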
define i32 @mlav2i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, ptr %y, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %conv2.1 = zext i8 %3 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %add.1 = add nuw nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %y, align 1
  %3 = zext <4 x i8> %2 to <4 x i32>
  %4 = mul nuw nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

define i32 @mlav8i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

define i32 @mlav16i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %y, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nuw nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

define i32 @mlav24i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
  %5 = load <16 x i8>, ptr %arrayidx.8, align 1
  %6 = zext <16 x i8> %5 to <16 x i32>
  %7 = load <16 x i8>, ptr %arrayidx1.8, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = mul nuw nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nuw nsw i32 %10, %11
  ret i32 %op.rdx
}

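; The single <32 x i8> reduction ends up split into 4-wide vldrb.u32 chunks,
; while the 64 and 128 element versions below were unrolled into 16-wide
; chunks that map directly onto vmlav.u8/vmlava.u8.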
define i32 @mlav32i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #4]
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #8]
; CHECK-NEXT: vldrb.u32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #12]
; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #16]
; CHECK-NEXT: vldrb.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #20]
; CHECK-NEXT: vldrb.u32 q1, [r1, #20]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #24]
; CHECK-NEXT: vldrb.u32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #28]
; CHECK-NEXT: vldrb.u32 q1, [r1, #28]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %y, align 1
  %3 = zext <32 x i8> %2 to <32 x i32>
  %4 = mul nuw nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

define i32 @mlav64i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %wide.load11 = load <16 x i8>, ptr %y, align 1
  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
  %2 = mul nuw nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  %4 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %4, align 1
  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %6 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
  %8 = mul nuw nsw <16 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  ret i32 %24
}

define i32 @mlav128i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #64]
; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #80]
; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #96]
; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #112]
; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %wide.load11 = load <16 x i8>, ptr %y, align 1
  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
  %2 = mul nuw nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  %4 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %4, align 1
  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %6 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
  %8 = mul nuw nsw <16 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %25, align 1
  %26 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %27 = getelementptr inbounds i8, ptr %y, i32 64
  %wide.load11.4 = load <16 x i8>, ptr %27, align 1
  %28 = zext <16 x i8> %wide.load11.4 to <16 x i32>
  %29 = mul nuw nsw <16 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %32, align 1
  %33 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %34 = getelementptr inbounds i8, ptr %y, i32 80
  %wide.load11.5 = load <16 x i8>, ptr %34, align 1
  %35 = zext <16 x i8> %wide.load11.5 to <16 x i32>
  %36 = mul nuw nsw <16 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %39, align 1
  %40 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %41 = getelementptr inbounds i8, ptr %y, i32 96
  %wide.load11.6 = load <16 x i8>, ptr %41, align 1
  %42 = zext <16 x i8> %wide.load11.6 to <16 x i32>
  %43 = mul nuw nsw <16 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %46, align 1
  %47 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %48 = getelementptr inbounds i8, ptr %y, i32 112
  %wide.load11.7 = load <16 x i8>, ptr %48, align 1
  %49 = zext <16 x i8> %wide.load11.7 to <16 x i32>
  %50 = mul nuw nsw <16 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

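; The remaining cases accumulate in i16, so only the bottom 16 bits matter:
; the reduction can use the unsigned vmlav.u16 forms and only the final
; scalar result needs an sxth.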
getelementptr inbounds i8, ptr %y, i32 80 2698 %wide.load11.5 = load <16 x i8>, ptr %34, align 1 2699 %35 = zext <16 x i8> %wide.load11.5 to <16 x i32> 2700 %36 = mul nuw nsw <16 x i32> %35, %33 2701 %37 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %36) 2702 %38 = add i32 %37, %31 2703 %39 = getelementptr inbounds i8, ptr %x, i32 96 2704 %wide.load.6 = load <16 x i8>, ptr %39, align 1 2705 %40 = zext <16 x i8> %wide.load.6 to <16 x i32> 2706 %41 = getelementptr inbounds i8, ptr %y, i32 96 2707 %wide.load11.6 = load <16 x i8>, ptr %41, align 1 2708 %42 = zext <16 x i8> %wide.load11.6 to <16 x i32> 2709 %43 = mul nuw nsw <16 x i32> %42, %40 2710 %44 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %43) 2711 %45 = add i32 %44, %38 2712 %46 = getelementptr inbounds i8, ptr %x, i32 112 2713 %wide.load.7 = load <16 x i8>, ptr %46, align 1 2714 %47 = zext <16 x i8> %wide.load.7 to <16 x i32> 2715 %48 = getelementptr inbounds i8, ptr %y, i32 112 2716 %wide.load11.7 = load <16 x i8>, ptr %48, align 1 2717 %49 = zext <16 x i8> %wide.load11.7 to <16 x i32> 2718 %50 = mul nuw nsw <16 x i32> %49, %47 2719 %51 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %50) 2720 %52 = add i32 %51, %45 2721 ret i32 %52 2722} 2723 2724define signext i16 @mlav2i16i16(ptr %x, ptr %y) { 2725; CHECK-LABEL: mlav2i16i16: 2726; CHECK: @ %bb.0: @ %entry 2727; CHECK-NEXT: ldrh r2, [r0] 2728; CHECK-NEXT: ldrh r3, [r1] 2729; CHECK-NEXT: ldrh r0, [r0, #2] 2730; CHECK-NEXT: ldrh r1, [r1, #2] 2731; CHECK-NEXT: muls r2, r3, r2 2732; CHECK-NEXT: mla r0, r1, r0, r2 2733; CHECK-NEXT: sxth r0, r0 2734; CHECK-NEXT: bx lr 2735entry: 2736 %0 = load i16, ptr %x, align 2 2737 %1 = load i16, ptr %y, align 2 2738 %mul = mul i16 %1, %0 2739 %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 2740 %2 = load i16, ptr %arrayidx.1, align 2 2741 %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1 2742 %3 = load i16, ptr %arrayidx1.1, align 2 2743 %mul.1 = mul i16 %3, %2 2744 %add.1 = add i16 %mul.1, %mul 2745 ret i16 %add.1 2746} 2747 2748define signext i16 @mlav4i16i16(ptr %x, ptr %y) { 2749; CHECK-LABEL: mlav4i16i16: 2750; CHECK: @ %bb.0: @ %entry 2751; CHECK-NEXT: vldrh.u32 q0, [r0] 2752; CHECK-NEXT: vldrh.u32 q1, [r1] 2753; CHECK-NEXT: vmlav.u32 r0, q1, q0 2754; CHECK-NEXT: sxth r0, r0 2755; CHECK-NEXT: bx lr 2756entry: 2757 %0 = load <4 x i16>, ptr %x, align 2 2758 %1 = load <4 x i16>, ptr %y, align 2 2759 %2 = mul <4 x i16> %1, %0 2760 %3 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %2) 2761 ret i16 %3 2762} 2763 2764define signext i16 @mlav8i16i16(ptr %x, ptr %y) { 2765; CHECK-LABEL: mlav8i16i16: 2766; CHECK: @ %bb.0: @ %entry 2767; CHECK-NEXT: vldrh.u16 q0, [r0] 2768; CHECK-NEXT: vldrh.u16 q1, [r1] 2769; CHECK-NEXT: vmlav.u16 r0, q1, q0 2770; CHECK-NEXT: sxth r0, r0 2771; CHECK-NEXT: bx lr 2772entry: 2773 %0 = load <8 x i16>, ptr %x, align 2 2774 %1 = load <8 x i16>, ptr %y, align 2 2775 %2 = mul <8 x i16> %1, %0 2776 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) 2777 ret i16 %3 2778} 2779 2780define signext i16 @mlav16i16i16(ptr %x, ptr %y) { 2781; CHECK-LABEL: mlav16i16i16: 2782; CHECK: @ %bb.0: @ %entry 2783; CHECK-NEXT: vldrh.u16 q0, [r0] 2784; CHECK-NEXT: vldrh.u16 q1, [r1] 2785; CHECK-NEXT: vmlav.u16 r2, q1, q0 2786; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 2787; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2788; CHECK-NEXT: vmlava.u16 r2, q1, q0 2789; CHECK-NEXT: sxth r0, r2 2790; CHECK-NEXT: bx lr 2791entry: 2792 %0 = load <16 x i16>, ptr %x, align 2 2793 %1 = load <16 x i16>, ptr %y, align 2 2794 %2 = 
mul <16 x i16> %1, %0 2795 %3 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %2) 2796 ret i16 %3 2797} 2798 2799define signext i16 @mlav24i16i16(ptr %x, ptr %y) { 2800; CHECK-LABEL: mlav24i16i16: 2801; CHECK: @ %bb.0: @ %entry 2802; CHECK-NEXT: vldrh.u16 q0, [r0] 2803; CHECK-NEXT: vldrh.u16 q1, [r1] 2804; CHECK-NEXT: vmlav.u16 r2, q1, q0 2805; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 2806; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2807; CHECK-NEXT: vmlava.u16 r2, q1, q0 2808; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 2809; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2810; CHECK-NEXT: vmlava.u16 r2, q1, q0 2811; CHECK-NEXT: sxth r0, r2 2812; CHECK-NEXT: bx lr 2813entry: 2814 %0 = load <8 x i16>, ptr %x, align 2 2815 %1 = load <8 x i16>, ptr %y, align 2 2816 %2 = mul <8 x i16> %1, %0 2817 %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8 2818 %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8 2819 %3 = load <16 x i16>, ptr %arrayidx.8, align 2 2820 %4 = load <16 x i16>, ptr %arrayidx1.8, align 2 2821 %5 = mul <16 x i16> %4, %3 2822 %6 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5) 2823 %7 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) 2824 %op.rdx = add i16 %6, %7 2825 ret i16 %op.rdx 2826} 2827 2828define signext i16 @mlav32i16i16(ptr %x, ptr %y) { 2829; CHECK-LABEL: mlav32i16i16: 2830; CHECK: @ %bb.0: @ %entry 2831; CHECK-NEXT: vldrh.u16 q0, [r0] 2832; CHECK-NEXT: vldrh.u16 q1, [r1] 2833; CHECK-NEXT: vmlav.u16 r2, q1, q0 2834; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 2835; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2836; CHECK-NEXT: vmlava.u16 r2, q1, q0 2837; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 2838; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2839; CHECK-NEXT: vmlava.u16 r2, q1, q0 2840; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 2841; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 2842; CHECK-NEXT: vmlava.u16 r2, q1, q0 2843; CHECK-NEXT: sxth r0, r2 2844; CHECK-NEXT: bx lr 2845entry: 2846 %0 = load <32 x i16>, ptr %x, align 2 2847 %1 = load <32 x i16>, ptr %y, align 2 2848 %2 = mul <32 x i16> %1, %0 2849 %3 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %2) 2850 ret i16 %3 2851} 2852 2853define signext i16 @mlav64i16i16(ptr %x, ptr %y) { 2854; CHECK-LABEL: mlav64i16i16: 2855; CHECK: @ %bb.0: @ %entry 2856; CHECK-NEXT: vldrh.u16 q0, [r0] 2857; CHECK-NEXT: vldrh.u16 q1, [r1] 2858; CHECK-NEXT: vmlav.u16 r2, q1, q0 2859; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 2860; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2861; CHECK-NEXT: vmlava.u16 r2, q1, q0 2862; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 2863; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2864; CHECK-NEXT: vmlava.u16 r2, q1, q0 2865; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 2866; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 2867; CHECK-NEXT: vmlava.u16 r2, q1, q0 2868; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 2869; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 2870; CHECK-NEXT: vmlava.u16 r2, q1, q0 2871; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 2872; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 2873; CHECK-NEXT: vmlava.u16 r2, q1, q0 2874; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 2875; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 2876; CHECK-NEXT: vmlava.u16 r2, q1, q0 2877; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 2878; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 2879; CHECK-NEXT: vmlava.u16 r2, q1, q0 2880; CHECK-NEXT: sxth r0, r2 2881; CHECK-NEXT: bx lr 2882entry: 2883 %wide.load = load <8 x i16>, ptr %x, align 2 2884 %wide.load13 = load <8 x i16>, ptr %y, align 2 2885 %0 = mul <8 x i16> %wide.load13, %wide.load 2886 %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0) 2887 %2 = getelementptr inbounds i16, ptr %x, i32 8 2888 %wide.load.1 
= load <8 x i16>, ptr %2, align 2 2889 %3 = getelementptr inbounds i16, ptr %y, i32 8 2890 %wide.load13.1 = load <8 x i16>, ptr %3, align 2 2891 %4 = mul <8 x i16> %wide.load13.1, %wide.load.1 2892 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4) 2893 %6 = add i16 %5, %1 2894 %7 = getelementptr inbounds i16, ptr %x, i32 16 2895 %wide.load.2 = load <8 x i16>, ptr %7, align 2 2896 %8 = getelementptr inbounds i16, ptr %y, i32 16 2897 %wide.load13.2 = load <8 x i16>, ptr %8, align 2 2898 %9 = mul <8 x i16> %wide.load13.2, %wide.load.2 2899 %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9) 2900 %11 = add i16 %10, %6 2901 %12 = getelementptr inbounds i16, ptr %x, i32 24 2902 %wide.load.3 = load <8 x i16>, ptr %12, align 2 2903 %13 = getelementptr inbounds i16, ptr %y, i32 24 2904 %wide.load13.3 = load <8 x i16>, ptr %13, align 2 2905 %14 = mul <8 x i16> %wide.load13.3, %wide.load.3 2906 %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14) 2907 %16 = add i16 %15, %11 2908 %17 = getelementptr inbounds i16, ptr %x, i32 32 2909 %wide.load.4 = load <8 x i16>, ptr %17, align 2 2910 %18 = getelementptr inbounds i16, ptr %y, i32 32 2911 %wide.load13.4 = load <8 x i16>, ptr %18, align 2 2912 %19 = mul <8 x i16> %wide.load13.4, %wide.load.4 2913 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) 2914 %21 = add i16 %20, %16 2915 %22 = getelementptr inbounds i16, ptr %x, i32 40 2916 %wide.load.5 = load <8 x i16>, ptr %22, align 2 2917 %23 = getelementptr inbounds i16, ptr %y, i32 40 2918 %wide.load13.5 = load <8 x i16>, ptr %23, align 2 2919 %24 = mul <8 x i16> %wide.load13.5, %wide.load.5 2920 %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24) 2921 %26 = add i16 %25, %21 2922 %27 = getelementptr inbounds i16, ptr %x, i32 48 2923 %wide.load.6 = load <8 x i16>, ptr %27, align 2 2924 %28 = getelementptr inbounds i16, ptr %y, i32 48 2925 %wide.load13.6 = load <8 x i16>, ptr %28, align 2 2926 %29 = mul <8 x i16> %wide.load13.6, %wide.load.6 2927 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29) 2928 %31 = add i16 %30, %26 2929 %32 = getelementptr inbounds i16, ptr %x, i32 56 2930 %wide.load.7 = load <8 x i16>, ptr %32, align 2 2931 %33 = getelementptr inbounds i16, ptr %y, i32 56 2932 %wide.load13.7 = load <8 x i16>, ptr %33, align 2 2933 %34 = mul <8 x i16> %wide.load13.7, %wide.load.7 2934 %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34) 2935 %36 = add i16 %35, %31 2936 ret i16 %36 2937} 2938 2939define signext i16 @mlav128i16i16(ptr %x, ptr %y) { 2940; CHECK-LABEL: mlav128i16i16: 2941; CHECK: @ %bb.0: @ %entry 2942; CHECK-NEXT: vldrh.u16 q0, [r0] 2943; CHECK-NEXT: vldrh.u16 q1, [r1] 2944; CHECK-NEXT: vmlav.u16 r2, q1, q0 2945; CHECK-NEXT: vldrh.u16 q0, [r0, #16] 2946; CHECK-NEXT: vldrh.u16 q1, [r1, #16] 2947; CHECK-NEXT: vmlava.u16 r2, q1, q0 2948; CHECK-NEXT: vldrh.u16 q0, [r0, #32] 2949; CHECK-NEXT: vldrh.u16 q1, [r1, #32] 2950; CHECK-NEXT: vmlava.u16 r2, q1, q0 2951; CHECK-NEXT: vldrh.u16 q0, [r0, #48] 2952; CHECK-NEXT: vldrh.u16 q1, [r1, #48] 2953; CHECK-NEXT: vmlava.u16 r2, q1, q0 2954; CHECK-NEXT: vldrh.u16 q0, [r0, #64] 2955; CHECK-NEXT: vldrh.u16 q1, [r1, #64] 2956; CHECK-NEXT: vmlava.u16 r2, q1, q0 2957; CHECK-NEXT: vldrh.u16 q0, [r0, #80] 2958; CHECK-NEXT: vldrh.u16 q1, [r1, #80] 2959; CHECK-NEXT: vmlava.u16 r2, q1, q0 2960; CHECK-NEXT: vldrh.u16 q0, [r0, #96] 2961; CHECK-NEXT: vldrh.u16 q1, [r1, #96] 2962; CHECK-NEXT: vmlava.u16 r2, q1, q0 2963; CHECK-NEXT: vldrh.u16 q0, [r0, #112] 2964; CHECK-NEXT: vldrh.u16 q1, [r1, #112] 2965; 
define signext i16 @mlav128i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %wide.load13 = load <8 x i16>, ptr %y, align 2
  %0 = mul <8 x i16> %wide.load13, %wide.load
  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
  %2 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %2, align 2
  %3 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load13.1 = load <8 x i16>, ptr %3, align 2
  %4 = mul <8 x i16> %wide.load13.1, %wide.load.1
  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
  %6 = add i16 %5, %1
  %7 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load13.2 = load <8 x i16>, ptr %8, align 2
  %9 = mul <8 x i16> %wide.load13.2, %wide.load.2
  %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9)
  %11 = add i16 %10, %6
  %12 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %12, align 2
  %13 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load13.3 = load <8 x i16>, ptr %13, align 2
  %14 = mul <8 x i16> %wide.load13.3, %wide.load.3
  %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14)
  %16 = add i16 %15, %11
  %17 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %17, align 2
  %18 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load13.4 = load <8 x i16>, ptr %18, align 2
  %19 = mul <8 x i16> %wide.load13.4, %wide.load.4
  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
  %21 = add i16 %20, %16
  %22 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %22, align 2
  %23 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load13.5 = load <8 x i16>, ptr %23, align 2
  %24 = mul <8 x i16> %wide.load13.5, %wide.load.5
  %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24)
  %26 = add i16 %25, %21
  %27 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %27, align 2
  %28 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load13.6 = load <8 x i16>, ptr %28, align 2
  %29 = mul <8 x i16> %wide.load13.6, %wide.load.6
  %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
  %31 = add i16 %30, %26
  %32 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %32, align 2
  %33 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load13.7 = load <8 x i16>, ptr %33, align 2
  %34 = mul <8 x i16> %wide.load13.7, %wide.load.7
  %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34)
  %36 = add i16 %35, %31
  %37 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %37, align 2
  %38 = getelementptr inbounds i16, ptr %y, i32 64
  %wide.load13.8 = load <8 x i16>, ptr %38, align 2
  %39 = mul <8 x i16> %wide.load13.8, %wide.load.8
  %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %39)
  %41 = add i16 %40, %36
  %42 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %42, align 2
  %43 = getelementptr inbounds i16, ptr %y, i32 72
  %wide.load13.9 = load <8 x i16>, ptr %43, align 2
  %44 = mul <8 x i16> %wide.load13.9, %wide.load.9
  %45 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %44)
  %46 = add i16 %45, %41
  %47 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %47, align 2
  %48 = getelementptr inbounds i16, ptr %y, i32 80
  %wide.load13.10 = load <8 x i16>, ptr %48, align 2
  %49 = mul <8 x i16> %wide.load13.10, %wide.load.10
  %50 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %49)
  %51 = add i16 %50, %46
  %52 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %52, align 2
  %53 = getelementptr inbounds i16, ptr %y, i32 88
  %wide.load13.11 = load <8 x i16>, ptr %53, align 2
  %54 = mul <8 x i16> %wide.load13.11, %wide.load.11
  %55 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %54)
  %56 = add i16 %55, %51
  %57 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %57, align 2
  %58 = getelementptr inbounds i16, ptr %y, i32 96
  %wide.load13.12 = load <8 x i16>, ptr %58, align 2
  %59 = mul <8 x i16> %wide.load13.12, %wide.load.12
  %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %59)
  %61 = add i16 %60, %56
  %62 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %62, align 2
  %63 = getelementptr inbounds i16, ptr %y, i32 104
  %wide.load13.13 = load <8 x i16>, ptr %63, align 2
  %64 = mul <8 x i16> %wide.load13.13, %wide.load.13
  %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64)
  %66 = add i16 %65, %61
  %67 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %67, align 2
  %68 = getelementptr inbounds i16, ptr %y, i32 112
  %wide.load13.14 = load <8 x i16>, ptr %68, align 2
  %69 = mul <8 x i16> %wide.load13.14, %wide.load.14
  %70 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %69)
  %71 = add i16 %70, %66
  %72 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %72, align 2
  %73 = getelementptr inbounds i16, ptr %y, i32 120
  %wide.load13.15 = load <8 x i16>, ptr %73, align 2
  %74 = mul <8 x i16> %wide.load13.15, %wide.load.15
  %75 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %74)
  %76 = add i16 %75, %71
  ret i16 %76
}

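; The i8 variants repeat the same ladder at byte granularity. A two-element
; reduction is presumably still cheaper as scalar loads with muls/mla, so no
; vector code is expected for mlav2i8i8.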
define zeroext i8 @mlav2i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: mla r0, r1, r0, r2
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %1 = load i8, ptr %y, align 1
  %mul = mul i8 %1, %0
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %mul.1 = mul i8 %3, %2
  %add.1 = add i8 %mul.1, %mul
  ret i8 %add.1
}

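; For 4 and 8 bytes the loads widen each element (vldrb.u32 / vldrb.u16) so
; the multiply-accumulate runs at i32/i16 lane width; only from 16 bytes up
; does the reduction use the full .u8 form.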
define zeroext i8 @mlav4i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = load <4 x i8>, ptr %y, align 1
  %2 = mul <4 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav8i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = load <8 x i8>, ptr %y, align 1
  %2 = mul <8 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav16i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = load <16 x i8>, ptr %y, align 1
  %2 = mul <16 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  ret i8 %3
}

define zeroext i8 @mlav24i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = load <8 x i8>, ptr %y, align 1
  %2 = mul <8 x i8> %1, %0
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
  %3 = load <16 x i8>, ptr %arrayidx.8, align 1
  %4 = load <16 x i8>, ptr %arrayidx1.8, align 1
  %5 = mul <16 x i8> %4, %3
  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
  %7 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
  %op.rdx = add i8 %6, %7
  ret i8 %op.rdx
}

define zeroext i8 @mlav32i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = load <32 x i8>, ptr %y, align 1
  %2 = mul <32 x i8> %1, %0
  %3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2)
  ret i8 %3
}

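; 64 and 128 bytes decompose into v16i8 chunks; each extra chunk should add
; just one vldrb.u8 pair and a vmlava.u8 to the accumulator chain.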
define zeroext i8 @mlav64i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %wide.load12 = load <16 x i8>, ptr %y, align 1
  %0 = mul <16 x i8> %wide.load12, %wide.load
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load12.1 = load <16 x i8>, ptr %3, align 1
  %4 = mul <16 x i8> %wide.load12.1, %wide.load.1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
  %6 = add i8 %5, %1
  %7 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %7, align 1
  %8 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load12.2 = load <16 x i8>, ptr %8, align 1
  %9 = mul <16 x i8> %wide.load12.2, %wide.load.2
  %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
  %11 = add i8 %10, %6
  %12 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %12, align 1
  %13 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load12.3 = load <16 x i8>, ptr %13, align 1
  %14 = mul <16 x i8> %wide.load12.3, %wide.load.3
  %15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
  %16 = add i8 %15, %11
  ret i8 %16
}

define zeroext i8 @mlav128i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %wide.load12 = load <16 x i8>, ptr %y, align 1
  %0 = mul <16 x i8> %wide.load12, %wide.load
  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
  %2 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %2, align 1
  %3 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load12.1 = load <16 x i8>, ptr %3, align 1
  %4 = mul <16 x i8> %wide.load12.1, %wide.load.1
  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
  %6 = add i8 %5, %1
  %7 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %7, align 1
  %8 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load12.2 = load <16 x i8>, ptr %8, align 1
  %9 = mul <16 x i8> %wide.load12.2, %wide.load.2
  %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
  %11 = add i8 %10, %6
  %12 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %12, align 1
  %13 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load12.3 = load <16 x i8>, ptr %13, align 1
  %14 = mul <16 x i8> %wide.load12.3, %wide.load.3
  %15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
  %16 = add i8 %15, %11
  %17 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %17, align 1
  %18 = getelementptr inbounds i8, ptr %y, i32 64
  %wide.load12.4 = load <16 x i8>, ptr %18, align 1
  %19 = mul <16 x i8> %wide.load12.4, %wide.load.4
  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %19)
  %21 = add i8 %20, %16
  %22 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %22, align 1
  %23 = getelementptr inbounds i8, ptr %y, i32 80
  %wide.load12.5 = load <16 x i8>, ptr %23, align 1
  %24 = mul <16 x i8> %wide.load12.5, %wide.load.5
  %25 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %24)
  %26 = add i8 %25, %21
  %27 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %27, align 1
  %28 = getelementptr inbounds i8, ptr %y, i32 96
  %wide.load12.6 = load <16 x i8>, ptr %28, align 1
  %29 = mul <16 x i8> %wide.load12.6, %wide.load.6
  %30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
  %31 = add i8 %30, %26
  %32 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %32, align 1
  %33 = getelementptr inbounds i8, ptr %y, i32 112
  %wide.load12.7 = load <16 x i8>, ptr %33, align 1
  %34 = mul <16 x i8> %wide.load12.7, %wide.load.7
  %35 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %34)
  %36 = add i8 %35, %31
  ret i8 %36
}

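; The add_two_const tests check that scalar constants added around the two
; reductions are reassociated into a single immediate add (adds #10 / #20)
; after the vaddv/vaddva pair.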
define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, %b
  %d = add i32 %c, 10
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %c, %b
  ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: vaddva.u32 r0, q1
; CHECK-NEXT: adds r0, #20
; CHECK-NEXT: bx lr
entry:
  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
  %c = add i32 %a, 10
  %d = add i32 %b, 10
  %e = add i32 %c, %d
  ret i32 %e
}

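; Declarations for the reduction intrinsics used by the tests above.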
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)