; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; Code generation tests for add-reductions of i8 vectors that are extended to
; i32 (optionally multiplied element-wise first). Both runs enable +dotprod and
; +i8mm; the first uses the default selector (SelectionDAG), the second
; GlobalISel. The assertions below are machine generated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Functions for which the GlobalISel run is expected to report a fallback.
; CHECK-GI:       warning: Instruction selection used fallback path for test_udot_v5i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v5i8_nomla
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8_double
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8_double_nomla
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v25i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v25i8_nomla
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8_double
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8_double_nomla
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v33i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v33i8_nomla
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8_double
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8_double_nomla

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v5i32(<5 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v25i32(<25 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v33i32(<33 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

; <4 x i8> loads, zext * zext, add-reduced and added to %sum. The expected
; output of both runs contains no udot (SelectionDAG: umull + addv).
define i32 @test_udot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_udot_v4i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr s0, [x0]
; CHECK-SD-NEXT:    ldr s1, [x1]
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    umull v0.4s, v1.4h, v0.4h
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v4i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    ldr w9, [x1]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    uxtb w8, w8
; CHECK-GI-NEXT:    uxtb w9, w9
; CHECK-GI-NEXT:    mov b1, v0.b[1]
; CHECK-GI-NEXT:    mov b3, v0.b[2]
; CHECK-GI-NEXT:    mov b5, v2.b[2]
; CHECK-GI-NEXT:    mov b4, v0.b[3]
; CHECK-GI-NEXT:    mov b0, v2.b[1]
; CHECK-GI-NEXT:    mov b6, v2.b[3]
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    fmov w11, s3
; CHECK-GI-NEXT:    fmov s1, w8
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    fmov w8, s4
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    uxtb w10, w10
; CHECK-GI-NEXT:    uxtb w11, w11
; CHECK-GI-NEXT:    uxtb w13, w13
; CHECK-GI-NEXT:    uxtb w8, w8
; CHECK-GI-NEXT:    uxtb w12, w12
; CHECK-GI-NEXT:    mov v1.h[1], w10
; CHECK-GI-NEXT:    fmov w10, s6
; CHECK-GI-NEXT:    fmov s0, w11
; CHECK-GI-NEXT:    fmov s3, w13
; CHECK-GI-NEXT:    mov v2.h[1], w12
; CHECK-GI-NEXT:    uxtb w10, w10
; CHECK-GI-NEXT:    mov v0.h[1], w8
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov v3.h[1], w10
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w2
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <4 x i8>, ptr %a
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b
  %3 = zext <4 x i8> %2 to <4 x i32>
  %4 = mul nuw nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add i32 %5, %sum
  ret i32 %op.extra
}

; Add-reduction of a zero-extended <4 x i8> load with no multiply.
define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) {
; CHECK-SD-LABEL: test_udot_v4i8_nomla:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr s0, [x0]
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v4i8_nomla:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    mov b1, v0.b[1]
; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT:    mov b3, v0.b[2]
; CHECK-GI-NEXT:    mov b0, v0.b[3]
; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <4 x i8>, ptr %a1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  ret i32 %2
}

; Signed counterpart of test_udot_v4i8: sext * sext, add-reduced, plus %sum.
define i32 @test_sdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_sdot_v4i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr s0, [x0]
; CHECK-SD-NEXT:    ldr s1, [x1]
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_sdot_v4i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    ldr w9, [x1]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    sxtb w8, w8
; CHECK-GI-NEXT:    sxtb w9, w9
; CHECK-GI-NEXT:    mov b1, v0.b[1]
; CHECK-GI-NEXT:    mov b3, v0.b[2]
; CHECK-GI-NEXT:    mov b5, v2.b[2]
; CHECK-GI-NEXT:    mov b4, v0.b[3]
; CHECK-GI-NEXT:    mov b0, v2.b[1]
; CHECK-GI-NEXT:    mov b6, v2.b[3]
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    fmov w11, s3
; CHECK-GI-NEXT:    fmov s1, w8
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    fmov w8, s4
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    sxtb w10, w10
; CHECK-GI-NEXT:    sxtb w11, w11
; CHECK-GI-NEXT:    sxtb w13, w13
; CHECK-GI-NEXT:    sxtb w8, w8
; CHECK-GI-NEXT:    sxtb w12, w12
; CHECK-GI-NEXT:    mov v1.h[1], w10
; CHECK-GI-NEXT:    fmov w10, s6
; CHECK-GI-NEXT:    fmov s0, w11
; CHECK-GI-NEXT:    fmov s3, w13
; CHECK-GI-NEXT:    mov v2.h[1], w12
; CHECK-GI-NEXT:    sxtb w10, w10
; CHECK-GI-NEXT:    mov v0.h[1], w8
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov v3.h[1], w10
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w2
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <4 x i8>, ptr %a
  %1 = sext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b
  %3 = sext <4 x i8> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Sum of two independent sext * sext reductions over <4 x i8> arguments.
define i32 @test_sdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-SD-LABEL: test_sdot_v4i8_double:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    shl v2.4s, v2.4s, #24
; CHECK-SD-NEXT:    shl v3.4s, v3.4s, #24
; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    sshr v2.4s, v2.4s, #24
; CHECK-SD-NEXT:    sshr v3.4s, v3.4s, #24
; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v2.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_sdot_v4i8_double:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #24
; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #24
; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #24
; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #24
; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
entry:
  %az = sext <4 x i8> %a to <4 x i32>
  %bz = sext <4 x i8> %b to <4 x i32>
  %m1 = mul nuw nsw <4 x i32> %az, %bz
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
  %cz = sext <4 x i8> %c to <4 x i32>
  %dz = sext <4 x i8> %d to <4 x i32>
  %m2 = mul nuw nsw <4 x i32> %cz, %dz
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; Sum of two sext reductions with no multiply; %b and %d are unused.
define i32 @test_sdot_v4i8_double_nomla(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-SD-LABEL: test_sdot_v4i8_double_nomla:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_sdot_v4i8_double_nomla:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v1.4h, v2.4h, #8
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s1, v1.4h
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s1
; CHECK-GI-NEXT:    fmov w9, s0
; CHECK-GI-NEXT:    sxth w8, w8
; CHECK-GI-NEXT:    add w0, w8, w9, sxth
; CHECK-GI-NEXT:    ret
entry:
  %az = sext <4 x i8> %a to <4 x i32>
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %az)
  %cz = sext <4 x i8> %c to <4 x i32>
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %cz)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; Mixed signedness on <4 x i8>: zext(%a) * sext(%b), add-reduced, plus %sum.
define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_usdot_v4i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr s0, [x0]
; CHECK-SD-NEXT:    ldr s1, [x1]
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_v4i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    ldr w9, [x1]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    uxtb w8, w8
; CHECK-GI-NEXT:    sxtb w9, w9
; CHECK-GI-NEXT:    mov b1, v0.b[1]
; CHECK-GI-NEXT:    mov b3, v0.b[2]
; CHECK-GI-NEXT:    mov b5, v2.b[2]
; CHECK-GI-NEXT:    mov b4, v0.b[3]
; CHECK-GI-NEXT:    mov b0, v2.b[1]
; CHECK-GI-NEXT:    mov b6, v2.b[3]
; CHECK-GI-NEXT:    fmov s2, w9
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    fmov w11, s3
; CHECK-GI-NEXT:    fmov s1, w8
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    fmov w8, s4
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    uxtb w10, w10
; CHECK-GI-NEXT:    uxtb w11, w11
; CHECK-GI-NEXT:    sxtb w13, w13
; CHECK-GI-NEXT:    uxtb w8, w8
; CHECK-GI-NEXT:    sxtb w12, w12
; CHECK-GI-NEXT:    mov v1.h[1], w10
; CHECK-GI-NEXT:    fmov w10, s6
; CHECK-GI-NEXT:    fmov s0, w11
; CHECK-GI-NEXT:    fmov s3, w13
; CHECK-GI-NEXT:    mov v2.h[1], w12
; CHECK-GI-NEXT:    sxtb w10, w10
; CHECK-GI-NEXT:    mov v0.h[1], w8
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov v3.h[1], w10
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w2
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <4 x i8>, ptr %a
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %b
  %3 = sext <4 x i8> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Sum of two zext * sext reductions over <4 x i8> arguments.
define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-SD-LABEL: test_usdot_v4i8_double:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT:    bic v2.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    shl v3.4s, v3.4s, #24
; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    sshr v3.4s, v3.4s, #24
; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #24
; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addv s0, v2.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_v4i8_double:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    movi v4.2d, #0x0000ff000000ff
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #24
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #24
; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    addv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
entry:
  %az = zext <4 x i8> %a to <4 x i32>
  %bz = sext <4 x i8> %b to <4 x i32>
  %m1 = mul nuw nsw <4 x i32> %az, %bz
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
  %cz = zext <4 x i8> %c to <4 x i32>
  %dz = sext <4 x i8> %d to <4 x i32>
  %m2 = mul nuw nsw <4 x i32> %cz, %dz
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; Non-power-of-two case: <5 x i8> zext * zext, add-reduced, plus %sum. This
; function is in the GlobalISel fallback list at the top of the file, and both
; runs share one set of assertions.
define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v5i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    mov v1.s[0], v2.s[0]
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w2
; CHECK-NEXT:    ret
entry:
  %0 = load <5 x i8>, ptr %a
  %1 = zext <5 x i8> %0 to <5 x i32>
  %2 = load <5 x i8>, ptr %b
  %3 = zext <5 x i8> %2 to <5 x i32>
  %4 = mul nuw nsw <5 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
  %op.extra = add i32 %5, %sum
  ret i32 %op.extra
}

; Add-reduction of a zero-extended <5 x i8> load with no multiply.
define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
; CHECK-LABEL: test_udot_v5i8_nomla:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    mov v1.s[0], v2.s[0]
; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %0 = load <5 x i8>, ptr %a1
  %1 = zext <5 x i8> %0 to <5 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %1)
  ret i32 %2
}

; Signed counterpart of test_udot_v5i8.
define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_sdot_v5i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
; CHECK-NEXT:    mov v1.s[0], v2.s[0]
; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w2
; CHECK-NEXT:    ret
entry:
  %0 = load <5 x i8>, ptr %a
  %1 = sext <5 x i8> %0 to <5 x i32>
  %2 = load <5 x i8>, ptr %b
  %3 = sext <5 x i8> %2 to <5 x i32>
  %4 = mul nsw <5 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Sum of two sext * sext reductions over <5 x i8> arguments.
define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
; CHECK-LABEL: test_sdot_v5i8_double:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    movi v3.2d, #0000000000000000
; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
; CHECK-NEXT:    sshll2 v5.4s, v2.8h, #0
; CHECK-NEXT:    mov v3.s[0], v4.s[0]
; CHECK-NEXT:    mov v1.s[0], v5.s[0]
; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
; CHECK-NEXT:    saddw v1.4s, v1.4s, v2.4h
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %az = sext <5 x i8> %a to <5 x i32>
  %bz = sext <5 x i8> %b to <5 x i32>
  %m1 = mul nuw nsw <5 x i32> %az, %bz
  %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m1)
  %cz = sext <5 x i8> %c to <5 x i32>
  %dz = sext <5 x i8> %d to <5 x i32>
  %m2 = mul nuw nsw <5 x i32> %cz, %dz
  %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m2)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; Sum of two sext reductions over <5 x i8> with no multiply; %b and %d are
; unused.
define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
; CHECK-LABEL: test_sdot_v5i8_double_nomla:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-NEXT:    sshll v1.8h, v2.8b, #0
; CHECK-NEXT:    movi v2.2d, #0000000000000000
; CHECK-NEXT:    movi v3.2d, #0000000000000000
; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
; CHECK-NEXT:    sshll2 v5.4s, v1.8h, #0
; CHECK-NEXT:    mov v3.s[0], v4.s[0]
; CHECK-NEXT:    mov v2.s[0], v5.s[0]
; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
; CHECK-NEXT:    saddw v1.4s, v2.4s, v1.4h
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %az = sext <5 x i8> %a to <5 x i32>
  %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %az)
  %cz = sext <5 x i8> %c to <5 x i32>
  %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %cz)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; <8 x i8> zext * zext add-reduction; both runs expect a single 64-bit udot
; followed by addp.
define i32 @test_udot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
; CHECK-LABEL: test_udot_v8i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    udot v0.2s, v2.8b, v1.8b
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

; Add-reduction of zext(<8 x i8>) with no multiply; the expected output is a
; udot against a vector of ones (movi #1).
define i32 @test_udot_v8i8_nomla(ptr nocapture readonly %a1) {
; CHECK-SD-LABEL: test_udot_v8i8_nomla:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    movi v1.8b, #1
; CHECK-SD-NEXT:    ldr d2, [x0]
; CHECK-SD-NEXT:    udot v0.2s, v2.8b, v1.8b
; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v8i8_nomla:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v0.8b, #1
; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
; CHECK-GI-NEXT:    ldr d2, [x0]
; CHECK-GI-NEXT:    udot v1.2s, v2.8b, v0.8b
; CHECK-GI-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

; Signed counterpart of test_udot_v8i8; both runs expect sdot + addp.
define i32 @test_sdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
; CHECK-LABEL: test_sdot_v8i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    sdot v0.2s, v2.8b, v1.8b
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a
  %1 = sext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b
  %3 = sext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

; Add-reduction of sext(<8 x i8>) with no multiply; the expected output is an
; sdot against a vector of ones (movi #1).
define i32 @test_sdot_v8i8_nomla(ptr nocapture readonly %a1) {
; CHECK-SD-LABEL: test_sdot_v8i8_nomla:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    movi v1.8b, #1
; CHECK-SD-NEXT:    ldr d2, [x0]
; CHECK-SD-NEXT:    sdot v0.2s, v2.8b, v1.8b
; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_sdot_v8i8_nomla:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v0.8b, #1
; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
; CHECK-GI-NEXT:    ldr d2, [x0]
; CHECK-GI-NEXT:    sdot v1.2s, v2.8b, v0.8b
; CHECK-GI-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a1
  %1 = sext <8 x i8> %0 to <8 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
  ret i32 %2
}

; Mixed signedness on <8 x i8>: zext(%a) * sext(%b). The SelectionDAG run
; expects usdot; the GlobalISel run expects explicit extends with mul/mla.
define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
; CHECK-SD-LABEL: test_usdot_v8i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    ldr d1, [x0]
; CHECK-SD-NEXT:    ldr d2, [x1]
; CHECK-SD-NEXT:    usdot v0.2s, v1.8b, v2.8b
; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_v8i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr d0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
; CHECK-GI-NEXT:    addv s0, v2.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b
  %3 = sext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

; Same as test_usdot_v8i8 with the extensions exchanged: sext(%a) * zext(%b).
; The SelectionDAG run expects usdot with its source operands swapped.
define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    ldr d1, [x0]
; CHECK-SD-NEXT:    ldr d2, [x1]
; CHECK-SD-NEXT:    usdot v0.2s, v2.8b, v1.8b
; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr d0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
; CHECK-GI-NEXT:    addv s0, v2.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <8 x i8>, ptr %a
  %1 = sext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %b
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

; <16 x i8> zext * zext add-reduction plus %sum; both runs expect a 128-bit
; udot followed by addv.
define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v16i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w2
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nuw nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add i32 %5, %sum
  ret i32 %op.extra
}

; Add-reduction of zext(<16 x i8>) with no multiply; expected as a udot against
; a vector of ones.
define i32 @test_udot_v16i8_nomla(ptr nocapture readonly %a1) {
; CHECK-LABEL: test_udot_v16i8_nomla:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.16b, #1
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
; CHECK-NEXT:    addv s0, v1.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

; Signed counterpart of test_udot_v16i8; both runs expect sdot + addv.
define i32 @test_sdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_sdot_v16i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w2
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b
  %3 = sext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Add-reduction of sext(<16 x i8>) with no multiply; expected as an sdot
; against a vector of ones.
define i32 @test_sdot_v16i8_nomla(ptr nocapture readonly %a1) {
; CHECK-LABEL: test_sdot_v16i8_nomla:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v0.16b, #1
; CHECK-NEXT:    movi v1.2d, #0000000000000000
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    sdot v1.4s, v2.16b, v0.16b
; CHECK-NEXT:    addv s0, v1.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a1
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

; Mixed signedness on <16 x i8>: zext(%a) * sext(%b), plus %sum. The
; SelectionDAG run expects usdot; the GlobalISel run expects explicit extends
; with mul/mla.
define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_usdot_v16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_v16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-GI-NEXT:    ushll2 v4.4s, v2.8h, #0
; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll2 v6.4s, v3.8h, #0
; CHECK-GI-NEXT:    sshll2 v7.4s, v1.8h, #0
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w2
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b
  %3 = sext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Same as test_usdot_v16i8 with the extensions exchanged: sext(%a) * zext(%b).
; The SelectionDAG run expects usdot with its source operands swapped.
define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v1.16b
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    add w0, w8, w2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-GI-NEXT:    sshll2 v4.4s, v2.8h, #0
; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll2 v6.4s, v3.8h, #0
; CHECK-GI-NEXT:    ushll2 v7.4s, v1.8h, #0
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w2
; CHECK-GI-NEXT:    ret
entry:
  %0 = load <16 x i8>, ptr %a
  %1 = sext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  %op.extra = add nsw i32 %5, %sum
  ret i32 %op.extra
}

; Sum of two zext * zext reductions over <8 x i8> arguments. The SelectionDAG
; run expects both udot instructions to accumulate into one register; the
; GlobalISel run expects two accumulators that are reduced separately and then
; added.
define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-SD-LABEL: test_udot_v8i8_double:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
; CHECK-SD-NEXT:    udot v4.2s, v2.8b, v3.8b
; CHECK-SD-NEXT:    udot v4.2s, v0.8b, v1.8b
; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v8i8_double:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
; CHECK-GI-NEXT:    udot v5.2s, v0.8b, v1.8b
; CHECK-GI-NEXT:    udot v4.2s, v2.8b, v3.8b
; CHECK-GI-NEXT:    addp v0.2s, v5.2s, v5.2s
; CHECK-GI-NEXT:    addp v1.2s, v4.2s, v4.2s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
entry:
  %az = zext <8 x i8> %a to <8 x i32>
  %bz = zext <8 x i8> %b to <8 x i32>
  %m1 = mul nuw nsw <8 x i32> %az, %bz
  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
  %cz = zext <8 x i8> %c to <8 x i32>
  %dz = zext <8 x i8> %d to <8 x i32>
  %m2 = mul nuw nsw <8 x i32> %cz, %dz
  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
  %x = add i32 %r1, %r2
  ret i32 %x
}

; Sum of two zext reductions over <8 x i8> with no multiply; %b and %d are
; unused. Each reduction is expected as a udot against a vector of ones.
define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-SD-LABEL: test_udot_v8i8_double_nomla:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-NEXT:    movi v3.8b, #1
; CHECK-SD-NEXT:    udot v1.2s, v2.8b, v3.8b
; CHECK-SD-NEXT:    udot v1.2s, v0.8b, v3.8b
; CHECK-SD-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v8i8_double_nomla:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.8b, #1
; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-NEXT:    udot v4.2s, v0.8b, v1.8b
; CHECK-GI-NEXT:    udot v3.2s, v2.8b, v1.8b
; CHECK-GI-NEXT:    addp v0.2s, v4.2s, v4.2s
; CHECK-GI-NEXT:    addp v1.2s, v3.2s, v3.2s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
entry:
  %az = zext <8 x i8> %a to <8 x i32>
  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
  %cz = zext <8 x i8> %c to <8 x i32>
  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
  %x = add i32 %r1, %r2
  ret i32 %x
}

define i32 @test_udot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-SD-LABEL: test_udot_v16i8_double:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
; CHECK-SD-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-SD-NEXT:    udot v4.4s, v0.16b, v1.16b
; CHECK-SD-NEXT:    addv s0, v4.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_udot_v16i8_double:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
; CHECK-GI-NEXT:    udot v5.4s, v0.16b, v1.16b
; CHECK-GI-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-GI-NEXT:    addv s0, v5.4s
; CHECK-GI-NEXT:    addv s1, v4.4s
; CHECK-GI-NEXT:    fmov w8, s0
;
CHECK-GI-NEXT: fmov w9, s1 967; CHECK-GI-NEXT: add w0, w8, w9 968; CHECK-GI-NEXT: ret 969entry: 970 %az = zext <16 x i8> %a to <16 x i32> 971 %bz = zext <16 x i8> %b to <16 x i32> 972 %m1 = mul nuw nsw <16 x i32> %az, %bz 973 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) 974 %cz = zext <16 x i8> %c to <16 x i32> 975 %dz = zext <16 x i8> %d to <16 x i32> 976 %m2 = mul nuw nsw <16 x i32> %cz, %dz 977 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) 978 %x = add i32 %r1, %r2 979 ret i32 %x 980} 981 982define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { 983; CHECK-SD-LABEL: test_udot_v16i8_double_nomla: 984; CHECK-SD: // %bb.0: // %entry 985; CHECK-SD-NEXT: movi v1.16b, #1 986; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 987; CHECK-SD-NEXT: udot v3.4s, v2.16b, v1.16b 988; CHECK-SD-NEXT: udot v3.4s, v0.16b, v1.16b 989; CHECK-SD-NEXT: addv s0, v3.4s 990; CHECK-SD-NEXT: fmov w0, s0 991; CHECK-SD-NEXT: ret 992; 993; CHECK-GI-LABEL: test_udot_v16i8_double_nomla: 994; CHECK-GI: // %bb.0: // %entry 995; CHECK-GI-NEXT: movi v1.16b, #1 996; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 997; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 998; CHECK-GI-NEXT: udot v4.4s, v0.16b, v1.16b 999; CHECK-GI-NEXT: udot v3.4s, v2.16b, v1.16b 1000; CHECK-GI-NEXT: addv s0, v4.4s 1001; CHECK-GI-NEXT: addv s1, v3.4s 1002; CHECK-GI-NEXT: fmov w8, s0 1003; CHECK-GI-NEXT: fmov w9, s1 1004; CHECK-GI-NEXT: add w0, w8, w9 1005; CHECK-GI-NEXT: ret 1006entry: 1007 %az = zext <16 x i8> %a to <16 x i32> 1008 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az) 1009 %cz = zext <16 x i8> %c to <16 x i32> 1010 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz) 1011 %x = add i32 %r1, %r2 1012 ret i32 %x 1013} 1014 1015define i32 @test_sdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { 1016; CHECK-SD-LABEL: test_sdot_v8i8_double: 1017; CHECK-SD: // %bb.0: // %entry 1018; CHECK-SD-NEXT: movi v4.2d, 
#0000000000000000 1019; CHECK-SD-NEXT: sdot v4.2s, v2.8b, v3.8b 1020; CHECK-SD-NEXT: sdot v4.2s, v0.8b, v1.8b 1021; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s 1022; CHECK-SD-NEXT: fmov w0, s0 1023; CHECK-SD-NEXT: ret 1024; 1025; CHECK-GI-LABEL: test_sdot_v8i8_double: 1026; CHECK-GI: // %bb.0: // %entry 1027; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 1028; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 1029; CHECK-GI-NEXT: sdot v5.2s, v0.8b, v1.8b 1030; CHECK-GI-NEXT: sdot v4.2s, v2.8b, v3.8b 1031; CHECK-GI-NEXT: addp v0.2s, v5.2s, v5.2s 1032; CHECK-GI-NEXT: addp v1.2s, v4.2s, v4.2s 1033; CHECK-GI-NEXT: fmov w8, s0 1034; CHECK-GI-NEXT: fmov w9, s1 1035; CHECK-GI-NEXT: add w0, w8, w9 1036; CHECK-GI-NEXT: ret 1037entry: 1038 %az = sext <8 x i8> %a to <8 x i32> 1039 %bz = sext <8 x i8> %b to <8 x i32> 1040 %m1 = mul nuw nsw <8 x i32> %az, %bz 1041 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1) 1042 %cz = sext <8 x i8> %c to <8 x i32> 1043 %dz = sext <8 x i8> %d to <8 x i32> 1044 %m2 = mul nuw nsw <8 x i32> %cz, %dz 1045 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2) 1046 %x = add i32 %r1, %r2 1047 ret i32 %x 1048} 1049 1050define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { 1051; CHECK-SD-LABEL: test_sdot_v8i8_double_nomla: 1052; CHECK-SD: // %bb.0: // %entry 1053; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 1054; CHECK-SD-NEXT: movi v3.8b, #1 1055; CHECK-SD-NEXT: sdot v1.2s, v2.8b, v3.8b 1056; CHECK-SD-NEXT: sdot v1.2s, v0.8b, v3.8b 1057; CHECK-SD-NEXT: addp v0.2s, v1.2s, v1.2s 1058; CHECK-SD-NEXT: fmov w0, s0 1059; CHECK-SD-NEXT: ret 1060; 1061; CHECK-GI-LABEL: test_sdot_v8i8_double_nomla: 1062; CHECK-GI: // %bb.0: // %entry 1063; CHECK-GI-NEXT: movi v1.8b, #1 1064; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 1065; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 1066; CHECK-GI-NEXT: sdot v4.2s, v0.8b, v1.8b 1067; CHECK-GI-NEXT: sdot v3.2s, v2.8b, v1.8b 1068; CHECK-GI-NEXT: addp v0.2s, v4.2s, v4.2s 1069; 
CHECK-GI-NEXT: addp v1.2s, v3.2s, v3.2s 1070; CHECK-GI-NEXT: fmov w8, s0 1071; CHECK-GI-NEXT: fmov w9, s1 1072; CHECK-GI-NEXT: add w0, w8, w9 1073; CHECK-GI-NEXT: ret 1074entry: 1075 %az = sext <8 x i8> %a to <8 x i32> 1076 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az) 1077 %cz = sext <8 x i8> %c to <8 x i32> 1078 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz) 1079 %x = add i32 %r1, %r2 1080 ret i32 %x 1081} 1082 1083define i32 @test_sdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { 1084; CHECK-SD-LABEL: test_sdot_v16i8_double: 1085; CHECK-SD: // %bb.0: // %entry 1086; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 1087; CHECK-SD-NEXT: sdot v4.4s, v2.16b, v3.16b 1088; CHECK-SD-NEXT: sdot v4.4s, v0.16b, v1.16b 1089; CHECK-SD-NEXT: addv s0, v4.4s 1090; CHECK-SD-NEXT: fmov w0, s0 1091; CHECK-SD-NEXT: ret 1092; 1093; CHECK-GI-LABEL: test_sdot_v16i8_double: 1094; CHECK-GI: // %bb.0: // %entry 1095; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 1096; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 1097; CHECK-GI-NEXT: sdot v5.4s, v0.16b, v1.16b 1098; CHECK-GI-NEXT: sdot v4.4s, v2.16b, v3.16b 1099; CHECK-GI-NEXT: addv s0, v5.4s 1100; CHECK-GI-NEXT: addv s1, v4.4s 1101; CHECK-GI-NEXT: fmov w8, s0 1102; CHECK-GI-NEXT: fmov w9, s1 1103; CHECK-GI-NEXT: add w0, w8, w9 1104; CHECK-GI-NEXT: ret 1105entry: 1106 %az = sext <16 x i8> %a to <16 x i32> 1107 %bz = sext <16 x i8> %b to <16 x i32> 1108 %m1 = mul nuw nsw <16 x i32> %az, %bz 1109 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) 1110 %cz = sext <16 x i8> %c to <16 x i32> 1111 %dz = sext <16 x i8> %d to <16 x i32> 1112 %m2 = mul nuw nsw <16 x i32> %cz, %dz 1113 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) 1114 %x = add i32 %r1, %r2 1115 ret i32 %x 1116} 1117 1118define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { 1119; CHECK-SD-LABEL: test_sdot_v16i8_double_nomla: 1120; CHECK-SD: // %bb.0: // %entry 
1121; CHECK-SD-NEXT: movi v1.16b, #1 1122; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 1123; CHECK-SD-NEXT: sdot v3.4s, v2.16b, v1.16b 1124; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b 1125; CHECK-SD-NEXT: addv s0, v3.4s 1126; CHECK-SD-NEXT: fmov w0, s0 1127; CHECK-SD-NEXT: ret 1128; 1129; CHECK-GI-LABEL: test_sdot_v16i8_double_nomla: 1130; CHECK-GI: // %bb.0: // %entry 1131; CHECK-GI-NEXT: movi v1.16b, #1 1132; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 1133; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 1134; CHECK-GI-NEXT: sdot v4.4s, v0.16b, v1.16b 1135; CHECK-GI-NEXT: sdot v3.4s, v2.16b, v1.16b 1136; CHECK-GI-NEXT: addv s0, v4.4s 1137; CHECK-GI-NEXT: addv s1, v3.4s 1138; CHECK-GI-NEXT: fmov w8, s0 1139; CHECK-GI-NEXT: fmov w9, s1 1140; CHECK-GI-NEXT: add w0, w8, w9 1141; CHECK-GI-NEXT: ret 1142entry: 1143 %az = sext <16 x i8> %a to <16 x i32> 1144 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az) 1145 %cz = sext <16 x i8> %c to <16 x i32> 1146 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz) 1147 %x = add i32 %r1, %r2 1148 ret i32 %x 1149} 1150 1151 1152define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { 1153; CHECK-SD-LABEL: test_usdot_v8i8_double: 1154; CHECK-SD: // %bb.0: // %entry 1155; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 1156; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 1157; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b 1158; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b 1159; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s 1160; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s 1161; CHECK-SD-NEXT: fmov w0, s0 1162; CHECK-SD-NEXT: ret 1163; 1164; CHECK-GI-LABEL: test_usdot_v8i8_double: 1165; CHECK-GI: // %bb.0: // %entry 1166; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1167; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 1168; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 1169; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 1170; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 1171; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 1172; CHECK-GI-NEXT: ushll2 
v6.4s, v2.8h, #0 1173; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 1174; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 1175; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 1176; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 1177; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 1178; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s 1179; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s 1180; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s 1181; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s 1182; CHECK-GI-NEXT: addv s0, v4.4s 1183; CHECK-GI-NEXT: addv s1, v5.4s 1184; CHECK-GI-NEXT: fmov w8, s0 1185; CHECK-GI-NEXT: fmov w9, s1 1186; CHECK-GI-NEXT: add w0, w8, w9 1187; CHECK-GI-NEXT: ret 1188entry: 1189 %az = zext <8 x i8> %a to <8 x i32> 1190 %bz = sext <8 x i8> %b to <8 x i32> 1191 %m1 = mul nuw nsw <8 x i32> %az, %bz 1192 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1) 1193 %cz = zext <8 x i8> %c to <8 x i32> 1194 %dz = sext <8 x i8> %d to <8 x i32> 1195 %m2 = mul nuw nsw <8 x i32> %cz, %dz 1196 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2) 1197 %x = add i32 %r1, %r2 1198 ret i32 %x 1199} 1200 1201define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { 1202; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double: 1203; CHECK-SD: // %bb.0: // %entry 1204; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 1205; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 1206; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b 1207; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b 1208; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s 1209; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s 1210; CHECK-SD-NEXT: fmov w0, s0 1211; CHECK-SD-NEXT: ret 1212; 1213; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double: 1214; CHECK-GI: // %bb.0: // %entry 1215; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 1216; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 1217; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 1218; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 1219; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 1220; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 1221; 
CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 1222; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 1223; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 1224; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 1225; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 1226; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 1227; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s 1228; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s 1229; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s 1230; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s 1231; CHECK-GI-NEXT: addv s0, v4.4s 1232; CHECK-GI-NEXT: addv s1, v5.4s 1233; CHECK-GI-NEXT: fmov w8, s0 1234; CHECK-GI-NEXT: fmov w9, s1 1235; CHECK-GI-NEXT: add w0, w8, w9 1236; CHECK-GI-NEXT: ret 1237entry: 1238 %az = sext <8 x i8> %a to <8 x i32> 1239 %bz = zext <8 x i8> %b to <8 x i32> 1240 %m1 = mul nuw nsw <8 x i32> %az, %bz 1241 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1) 1242 %cz = sext <8 x i8> %c to <8 x i32> 1243 %dz = zext <8 x i8> %d to <8 x i32> 1244 %m2 = mul nuw nsw <8 x i32> %cz, %dz 1245 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2) 1246 %x = add i32 %r1, %r2 1247 ret i32 %x 1248} 1249 1250define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { 1251; CHECK-SD-LABEL: test_usdot_v16i8_double: 1252; CHECK-SD: // %bb.0: // %entry 1253; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 1254; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 1255; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b 1256; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b 1257; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s 1258; CHECK-SD-NEXT: addv s0, v0.4s 1259; CHECK-SD-NEXT: fmov w0, s0 1260; CHECK-SD-NEXT: ret 1261; 1262; CHECK-GI-LABEL: test_usdot_v16i8_double: 1263; CHECK-GI: // %bb.0: // %entry 1264; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0 1265; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 1266; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 1267; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 1268; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0 1269; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0 1270; CHECK-GI-NEXT: ushll2 v2.8h, 
v2.16b, #0 1271; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 1272; CHECK-GI-NEXT: ushll2 v16.4s, v4.8h, #0 1273; CHECK-GI-NEXT: ushll2 v17.4s, v0.8h, #0 1274; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 1275; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 1276; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 1277; CHECK-GI-NEXT: sshll2 v21.4s, v7.8h, #0 1278; CHECK-GI-NEXT: ushll2 v22.4s, v2.8h, #0 1279; CHECK-GI-NEXT: sshll2 v23.4s, v3.8h, #0 1280; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 1281; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s 1282; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 1283; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 1284; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s 1285; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s 1286; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 1287; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s 1288; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 1289; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 1290; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 1291; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 1292; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s 1293; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s 1294; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s 1295; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s 1296; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s 1297; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s 1298; CHECK-GI-NEXT: addv s0, v0.4s 1299; CHECK-GI-NEXT: addv s1, v1.4s 1300; CHECK-GI-NEXT: fmov w8, s0 1301; CHECK-GI-NEXT: fmov w9, s1 1302; CHECK-GI-NEXT: add w0, w8, w9 1303; CHECK-GI-NEXT: ret 1304entry: 1305 %az = zext <16 x i8> %a to <16 x i32> 1306 %bz = sext <16 x i8> %b to <16 x i32> 1307 %m1 = mul nuw nsw <16 x i32> %az, %bz 1308 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) 1309 %cz = zext <16 x i8> %c to <16 x i32> 1310 %dz = sext <16 x i8> %d to <16 x i32> 1311 %m2 = mul nuw nsw <16 x i32> %cz, %dz 1312 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) 1313 %x = add i32 %r1, %r2 1314 ret i32 %x 1315} 1316 1317 1318define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x 
i8> %c, <16 x i8> %d) { 1319; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double: 1320; CHECK-SD: // %bb.0: // %entry 1321; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 1322; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 1323; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b 1324; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b 1325; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s 1326; CHECK-SD-NEXT: addv s0, v0.4s 1327; CHECK-SD-NEXT: fmov w0, s0 1328; CHECK-SD-NEXT: ret 1329; 1330; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double: 1331; CHECK-GI: // %bb.0: // %entry 1332; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 1333; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 1334; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0 1335; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 1336; CHECK-GI-NEXT: sshll v6.8h, v2.8b, #0 1337; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 1338; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 1339; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 1340; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 1341; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 1342; CHECK-GI-NEXT: ushll2 v18.4s, v5.8h, #0 1343; CHECK-GI-NEXT: ushll2 v19.4s, v1.8h, #0 1344; CHECK-GI-NEXT: sshll2 v20.4s, v6.8h, #0 1345; CHECK-GI-NEXT: ushll2 v21.4s, v7.8h, #0 1346; CHECK-GI-NEXT: sshll2 v22.4s, v2.8h, #0 1347; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 1348; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 1349; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s 1350; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 1351; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 1352; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s 1353; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s 1354; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 1355; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s 1356; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 1357; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 1358; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 1359; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 1360; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s 1361; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s 1362; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s 1363; CHECK-GI-NEXT: 
mla v19.4s, v2.4s, v3.4s 1364; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s 1365; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s 1366; CHECK-GI-NEXT: addv s0, v0.4s 1367; CHECK-GI-NEXT: addv s1, v1.4s 1368; CHECK-GI-NEXT: fmov w8, s0 1369; CHECK-GI-NEXT: fmov w9, s1 1370; CHECK-GI-NEXT: add w0, w8, w9 1371; CHECK-GI-NEXT: ret 1372entry: 1373 %az = sext <16 x i8> %a to <16 x i32> 1374 %bz = zext <16 x i8> %b to <16 x i32> 1375 %m1 = mul nuw nsw <16 x i32> %az, %bz 1376 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) 1377 %cz = sext <16 x i8> %c to <16 x i32> 1378 %dz = zext <16 x i8> %d to <16 x i32> 1379 %m2 = mul nuw nsw <16 x i32> %cz, %dz 1380 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) 1381 %x = add i32 %r1, %r2 1382 ret i32 %x 1383} 1384 1385define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 1386; CHECK-SD-LABEL: test_udot_v24i8: 1387; CHECK-SD: // %bb.0: // %entry 1388; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 1389; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 1390; CHECK-SD-NEXT: ldr q2, [x0] 1391; CHECK-SD-NEXT: ldr q3, [x1] 1392; CHECK-SD-NEXT: ldr d4, [x0, #16] 1393; CHECK-SD-NEXT: ldr d5, [x1, #16] 1394; CHECK-SD-NEXT: udot v1.2s, v5.8b, v4.8b 1395; CHECK-SD-NEXT: udot v0.4s, v3.16b, v2.16b 1396; CHECK-SD-NEXT: addp v1.2s, v1.2s, v1.2s 1397; CHECK-SD-NEXT: addv s0, v0.4s 1398; CHECK-SD-NEXT: fmov w8, s1 1399; CHECK-SD-NEXT: fmov w9, s0 1400; CHECK-SD-NEXT: add w8, w9, w8 1401; CHECK-SD-NEXT: add w0, w8, w2 1402; CHECK-SD-NEXT: ret 1403; 1404; CHECK-GI-LABEL: test_udot_v24i8: 1405; CHECK-GI: // %bb.0: // %entry 1406; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 1407; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 1408; CHECK-GI-NEXT: ldr q2, [x0] 1409; CHECK-GI-NEXT: ldr d3, [x0, #16] 1410; CHECK-GI-NEXT: ldr q4, [x1] 1411; CHECK-GI-NEXT: ldr d5, [x1, #16] 1412; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b 1413; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b 1414; CHECK-GI-NEXT: add v0.4s, v1.4s, 
v0.4s 1415; CHECK-GI-NEXT: addv s0, v0.4s 1416; CHECK-GI-NEXT: fmov w8, s0 1417; CHECK-GI-NEXT: add w0, w8, w2 1418; CHECK-GI-NEXT: ret 1419entry: 1420 %0 = load <24 x i8>, ptr %a 1421 %1 = zext <24 x i8> %0 to <24 x i32> 1422 %2 = load <24 x i8>, ptr %b 1423 %3 = zext <24 x i8> %2 to <24 x i32> 1424 %4 = mul nuw nsw <24 x i32> %3, %1 1425 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4) 1426 %op.extra = add i32 %5, %sum 1427 ret i32 %op.extra 1428} 1429 1430define i32 @test_udot_v24i8_nomla(ptr nocapture readonly %a1) { 1431; CHECK-SD-LABEL: test_udot_v24i8_nomla: 1432; CHECK-SD: // %bb.0: // %entry 1433; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 1434; CHECK-SD-NEXT: movi v1.8b, #1 1435; CHECK-SD-NEXT: ldr q4, [x0] 1436; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 1437; CHECK-SD-NEXT: movi v3.16b, #1 1438; CHECK-SD-NEXT: ldr d5, [x0, #16] 1439; CHECK-SD-NEXT: udot v2.2s, v5.8b, v1.8b 1440; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b 1441; CHECK-SD-NEXT: addp v1.2s, v2.2s, v2.2s 1442; CHECK-SD-NEXT: addv s0, v0.4s 1443; CHECK-SD-NEXT: fmov w8, s1 1444; CHECK-SD-NEXT: fmov w9, s0 1445; CHECK-SD-NEXT: add w0, w9, w8 1446; CHECK-SD-NEXT: ret 1447; 1448; CHECK-GI-LABEL: test_udot_v24i8_nomla: 1449; CHECK-GI: // %bb.0: // %entry 1450; CHECK-GI-NEXT: movi v0.8b, #1 1451; CHECK-GI-NEXT: movi v1.8b, #1 1452; CHECK-GI-NEXT: ldr q4, [x0] 1453; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 1454; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 1455; CHECK-GI-NEXT: ldr d5, [x0, #16] 1456; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] 1457; CHECK-GI-NEXT: udot v2.4s, v5.16b, v0.16b 1458; CHECK-GI-NEXT: udot v3.4s, v4.16b, v1.16b 1459; CHECK-GI-NEXT: add v0.4s, v3.4s, v2.4s 1460; CHECK-GI-NEXT: addv s0, v0.4s 1461; CHECK-GI-NEXT: fmov w0, s0 1462; CHECK-GI-NEXT: ret 1463entry: 1464 %0 = load <24 x i8>, ptr %a1 1465 %1 = zext <24 x i8> %0 to <24 x i32> 1466 %2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %1) 1467 ret i32 %2 1468} 1469define i32 @test_sdot_v24i8(ptr 
nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 1470; CHECK-SD-LABEL: test_sdot_v24i8: 1471; CHECK-SD: // %bb.0: // %entry 1472; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 1473; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 1474; CHECK-SD-NEXT: ldr q2, [x0] 1475; CHECK-SD-NEXT: ldr q3, [x1] 1476; CHECK-SD-NEXT: ldr d4, [x0, #16] 1477; CHECK-SD-NEXT: ldr d5, [x1, #16] 1478; CHECK-SD-NEXT: sdot v1.2s, v5.8b, v4.8b 1479; CHECK-SD-NEXT: sdot v0.4s, v3.16b, v2.16b 1480; CHECK-SD-NEXT: addp v1.2s, v1.2s, v1.2s 1481; CHECK-SD-NEXT: addv s0, v0.4s 1482; CHECK-SD-NEXT: fmov w8, s1 1483; CHECK-SD-NEXT: fmov w9, s0 1484; CHECK-SD-NEXT: add w8, w9, w8 1485; CHECK-SD-NEXT: add w0, w8, w2 1486; CHECK-SD-NEXT: ret 1487; 1488; CHECK-GI-LABEL: test_sdot_v24i8: 1489; CHECK-GI: // %bb.0: // %entry 1490; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 1491; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 1492; CHECK-GI-NEXT: ldr q2, [x0] 1493; CHECK-GI-NEXT: ldr d3, [x0, #16] 1494; CHECK-GI-NEXT: ldr q4, [x1] 1495; CHECK-GI-NEXT: ldr d5, [x1, #16] 1496; CHECK-GI-NEXT: sdot v1.4s, v4.16b, v2.16b 1497; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b 1498; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s 1499; CHECK-GI-NEXT: addv s0, v0.4s 1500; CHECK-GI-NEXT: fmov w8, s0 1501; CHECK-GI-NEXT: add w0, w8, w2 1502; CHECK-GI-NEXT: ret 1503entry: 1504 %0 = load <24 x i8>, ptr %a 1505 %1 = sext <24 x i8> %0 to <24 x i32> 1506 %2 = load <24 x i8>, ptr %b 1507 %3 = sext <24 x i8> %2 to <24 x i32> 1508 %4 = mul nsw <24 x i32> %3, %1 1509 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4) 1510 %op.extra = add nsw i32 %5, %sum 1511 ret i32 %op.extra 1512} 1513 1514define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) { 1515; CHECK-SD-LABEL: test_sdot_v24i8_double: 1516; CHECK-SD: // %bb.0: // %entry 1517; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 1518; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 1519; CHECK-SD-NEXT: .cfi_offset w29, -16 1520; CHECK-SD-NEXT: fmov s0, w0 1521; CHECK-SD-NEXT: ldr b1, [sp, #144] 1522; CHECK-SD-NEXT: add x10, sp, #152 1523; CHECK-SD-NEXT: add x9, sp, #160 1524; CHECK-SD-NEXT: add x8, sp, #168 1525; CHECK-SD-NEXT: ldr b2, [sp, #272] 1526; CHECK-SD-NEXT: ld1 { v1.b }[1], [x10] 1527; CHECK-SD-NEXT: add x11, sp, #280 1528; CHECK-SD-NEXT: ldr b3, [sp, #80] 1529; CHECK-SD-NEXT: mov v0.b[1], w1 1530; CHECK-SD-NEXT: ldr b4, [sp, #528] 1531; CHECK-SD-NEXT: add x10, sp, #88 1532; CHECK-SD-NEXT: ld1 { v2.b }[1], [x11] 1533; CHECK-SD-NEXT: add x11, sp, #536 1534; CHECK-SD-NEXT: ldr b5, [sp, #336] 1535; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] 1536; CHECK-SD-NEXT: ld1 { v3.b }[1], [x10] 1537; CHECK-SD-NEXT: add x10, sp, #344 1538; CHECK-SD-NEXT: ld1 { v4.b }[1], [x11] 1539; CHECK-SD-NEXT: add x11, sp, #176 1540; CHECK-SD-NEXT: ldr b6, [sp, #656] 1541; CHECK-SD-NEXT: mov v0.b[2], w2 1542; CHECK-SD-NEXT: ld1 { v5.b }[1], [x10] 1543; CHECK-SD-NEXT: ldr b7, [sp, #464] 1544; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] 1545; CHECK-SD-NEXT: add x12, sp, #664 1546; CHECK-SD-NEXT: add x9, sp, #472 1547; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] 1548; CHECK-SD-NEXT: add x8, sp, #96 1549; CHECK-SD-NEXT: add x10, sp, #184 1550; CHECK-SD-NEXT: add x12, sp, #288 1551; CHECK-SD-NEXT: ld1 { v7.b }[1], [x9] 1552; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] 1553; CHECK-SD-NEXT: mov v0.b[3], w3 1554; CHECK-SD-NEXT: ld1 { v1.b }[4], [x11] 1555; CHECK-SD-NEXT: add x8, sp, #352 1556; CHECK-SD-NEXT: ld1 { v2.b }[2], [x12] 1557; CHECK-SD-NEXT: add x13, sp, #544 1558; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] 1559; CHECK-SD-NEXT: add x8, sp, #672 1560; CHECK-SD-NEXT: ld1 { v4.b }[2], [x13] 1561; CHECK-SD-NEXT: add x9, sp, #192 1562; CHECK-SD-NEXT: ld1 { v1.b }[5], [x10] 1563; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] 1564; CHECK-SD-NEXT: add x8, sp, #480 1565; CHECK-SD-NEXT: mov v0.b[4], w4 1566; CHECK-SD-NEXT: ld1 { v7.b }[2], 
[x8] 1567; CHECK-SD-NEXT: add x8, sp, #296 1568; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] 1569; CHECK-SD-NEXT: add x8, sp, #552 1570; CHECK-SD-NEXT: add x12, sp, #200 1571; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] 1572; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] 1573; CHECK-SD-NEXT: add x8, sp, #360 1574; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] 1575; CHECK-SD-NEXT: add x8, sp, #104 1576; CHECK-SD-NEXT: add x9, sp, #560 1577; CHECK-SD-NEXT: mov v0.b[5], w5 1578; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] 1579; CHECK-SD-NEXT: add x8, sp, #368 1580; CHECK-SD-NEXT: ld1 { v1.b }[7], [x12] 1581; CHECK-SD-NEXT: ld1 { v4.b }[4], [x9] 1582; CHECK-SD-NEXT: add x13, sp, #208 1583; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] 1584; CHECK-SD-NEXT: add x12, sp, #304 1585; CHECK-SD-NEXT: add x8, sp, #568 1586; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12] 1587; CHECK-SD-NEXT: add x12, sp, #16 1588; CHECK-SD-NEXT: add x17, sp, #376 1589; CHECK-SD-NEXT: mov v0.b[6], w6 1590; CHECK-SD-NEXT: ld1 { v1.b }[8], [x13] 1591; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] 1592; CHECK-SD-NEXT: add x14, sp, #216 1593; CHECK-SD-NEXT: ld1 { v5.b }[5], [x17] 1594; CHECK-SD-NEXT: add x13, sp, #576 1595; CHECK-SD-NEXT: add x11, sp, #224 1596; CHECK-SD-NEXT: add x10, sp, #232 1597; CHECK-SD-NEXT: add x15, sp, #240 1598; CHECK-SD-NEXT: ld1 { v1.b }[9], [x14] 1599; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] 1600; CHECK-SD-NEXT: add x13, sp, #384 1601; CHECK-SD-NEXT: mov v0.b[7], w7 1602; CHECK-SD-NEXT: ld1 { v5.b }[6], [x13] 1603; CHECK-SD-NEXT: add x13, sp, #112 1604; CHECK-SD-NEXT: ld1 { v3.b }[4], [x13] 1605; CHECK-SD-NEXT: add x13, sp, #32 1606; CHECK-SD-NEXT: add x14, sp, #584 1607; CHECK-SD-NEXT: ld1 { v1.b }[10], [x11] 1608; CHECK-SD-NEXT: ld1 { v4.b }[7], [x14] 1609; CHECK-SD-NEXT: add x11, sp, #312 1610; CHECK-SD-NEXT: add x14, sp, #40 1611; CHECK-SD-NEXT: ld1 { v2.b }[5], [x11] 1612; CHECK-SD-NEXT: add x11, sp, #592 1613; CHECK-SD-NEXT: ld1 { v0.b }[8], [x12] 1614; CHECK-SD-NEXT: add x12, sp, #24 1615; CHECK-SD-NEXT: add x16, sp, #248 1616; 
CHECK-SD-NEXT: ld1 { v1.b }[11], [x10] 1617; CHECK-SD-NEXT: ld1 { v4.b }[8], [x11] 1618; CHECK-SD-NEXT: add x11, sp, #400 1619; CHECK-SD-NEXT: add x9, sp, #256 1620; CHECK-SD-NEXT: add x8, sp, #264 1621; CHECK-SD-NEXT: add x10, sp, #72 1622; CHECK-SD-NEXT: ld1 { v0.b }[9], [x12] 1623; CHECK-SD-NEXT: add x12, sp, #392 1624; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 1625; CHECK-SD-NEXT: ld1 { v5.b }[7], [x12] 1626; CHECK-SD-NEXT: add x12, sp, #48 1627; CHECK-SD-NEXT: ld1 { v1.b }[12], [x15] 1628; CHECK-SD-NEXT: add x15, sp, #120 1629; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 1630; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 1631; CHECK-SD-NEXT: ld1 { v0.b }[10], [x13] 1632; CHECK-SD-NEXT: ld1 { v3.b }[5], [x15] 1633; CHECK-SD-NEXT: add x15, sp, #408 1634; CHECK-SD-NEXT: ld1 { v5.b }[8], [x11] 1635; CHECK-SD-NEXT: add x13, sp, #56 1636; CHECK-SD-NEXT: ld1 { v1.b }[13], [x16] 1637; CHECK-SD-NEXT: add x11, sp, #64 1638; CHECK-SD-NEXT: add x16, sp, #616 1639; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 1640; CHECK-SD-NEXT: ld1 { v0.b }[11], [x14] 1641; CHECK-SD-NEXT: add x14, sp, #600 1642; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] 1643; CHECK-SD-NEXT: ld1 { v5.b }[9], [x15] 1644; CHECK-SD-NEXT: add x15, sp, #608 1645; CHECK-SD-NEXT: ld1 { v1.b }[14], [x9] 1646; CHECK-SD-NEXT: add x9, sp, #488 1647; CHECK-SD-NEXT: add x14, sp, #320 1648; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] 1649; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9] 1650; CHECK-SD-NEXT: ld1 { v2.b }[6], [x14] 1651; CHECK-SD-NEXT: ld1 { v4.b }[10], [x15] 1652; CHECK-SD-NEXT: add x14, sp, #624 1653; CHECK-SD-NEXT: add x9, sp, #688 1654; CHECK-SD-NEXT: ld1 { v1.b }[15], [x8] 1655; CHECK-SD-NEXT: add x8, sp, #432 1656; CHECK-SD-NEXT: add x12, sp, #328 1657; CHECK-SD-NEXT: ld1 { v0.b }[13], [x13] 1658; CHECK-SD-NEXT: add x13, sp, #416 1659; CHECK-SD-NEXT: ld1 { v2.b }[7], [x12] 1660; CHECK-SD-NEXT: ld1 { v5.b }[10], [x13] 1661; CHECK-SD-NEXT: ld1 { v4.b }[11], [x16] 1662; CHECK-SD-NEXT: add x16, sp, #680 1663; 
CHECK-SD-NEXT: ld1 { v6.b }[3], [x16] 1664; CHECK-SD-NEXT: add x13, sp, #632 1665; CHECK-SD-NEXT: add x12, sp, #504 1666; CHECK-SD-NEXT: ld1 { v0.b }[14], [x11] 1667; CHECK-SD-NEXT: add x11, sp, #424 1668; CHECK-SD-NEXT: add x15, sp, #128 1669; CHECK-SD-NEXT: ld1 { v5.b }[11], [x11] 1670; CHECK-SD-NEXT: ld1 { v4.b }[12], [x14] 1671; CHECK-SD-NEXT: add x11, sp, #696 1672; CHECK-SD-NEXT: ld1 { v6.b }[4], [x9] 1673; CHECK-SD-NEXT: ld1 { v3.b }[6], [x15] 1674; CHECK-SD-NEXT: add x9, sp, #640 1675; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] 1676; CHECK-SD-NEXT: add x10, sp, #496 1677; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] 1678; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10] 1679; CHECK-SD-NEXT: ld1 { v4.b }[13], [x13] 1680; CHECK-SD-NEXT: add x10, sp, #440 1681; CHECK-SD-NEXT: ld1 { v6.b }[5], [x11] 1682; CHECK-SD-NEXT: add x11, sp, #512 1683; CHECK-SD-NEXT: add x8, sp, #136 1684; CHECK-SD-NEXT: sdot v17.4s, v0.16b, v1.16b 1685; CHECK-SD-NEXT: ld1 { v5.b }[13], [x10] 1686; CHECK-SD-NEXT: ld1 { v7.b }[5], [x12] 1687; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] 1688; CHECK-SD-NEXT: add x9, sp, #448 1689; CHECK-SD-NEXT: add x10, sp, #704 1690; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] 1691; CHECK-SD-NEXT: ld1 { v6.b }[6], [x10] 1692; CHECK-SD-NEXT: add x8, sp, #648 1693; CHECK-SD-NEXT: add x10, sp, #520 1694; CHECK-SD-NEXT: ld1 { v5.b }[14], [x9] 1695; CHECK-SD-NEXT: ld1 { v7.b }[6], [x11] 1696; CHECK-SD-NEXT: ld1 { v4.b }[15], [x8] 1697; CHECK-SD-NEXT: add x8, sp, #456 1698; CHECK-SD-NEXT: add x9, sp, #712 1699; CHECK-SD-NEXT: sdot v19.2s, v3.8b, v2.8b 1700; CHECK-SD-NEXT: ld1 { v6.b }[7], [x9] 1701; CHECK-SD-NEXT: addv s0, v17.4s 1702; CHECK-SD-NEXT: ld1 { v5.b }[15], [x8] 1703; CHECK-SD-NEXT: ld1 { v7.b }[7], [x10] 1704; CHECK-SD-NEXT: addp v1.2s, v19.2s, v19.2s 1705; CHECK-SD-NEXT: fmov w8, s0 1706; CHECK-SD-NEXT: sdot v16.4s, v5.16b, v4.16b 1707; CHECK-SD-NEXT: sdot v18.2s, v7.8b, v6.8b 1708; CHECK-SD-NEXT: fmov w9, s1 1709; CHECK-SD-NEXT: addv s2, v16.4s 1710; CHECK-SD-NEXT: addp v3.2s, 
v18.2s, v18.2s 1711; CHECK-SD-NEXT: add w8, w8, w9 1712; CHECK-SD-NEXT: fmov w10, s2 1713; CHECK-SD-NEXT: fmov w11, s3 1714; CHECK-SD-NEXT: add w9, w10, w11 1715; CHECK-SD-NEXT: add w0, w8, w9 1716; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1717; CHECK-SD-NEXT: ret 1718; 1719; CHECK-GI-LABEL: test_sdot_v24i8_double: 1720; CHECK-GI: // %bb.0: // %entry 1721; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1722; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 1723; CHECK-GI-NEXT: .cfi_offset w29, -16 1724; CHECK-GI-NEXT: ldr w8, [sp, #80] 1725; CHECK-GI-NEXT: ldr w9, [sp, #88] 1726; CHECK-GI-NEXT: fmov s1, w0 1727; CHECK-GI-NEXT: ldr w11, [sp, #336] 1728; CHECK-GI-NEXT: ldr w10, [sp, #280] 1729; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 1730; CHECK-GI-NEXT: fmov s0, w8 1731; CHECK-GI-NEXT: ldr w8, [sp, #96] 1732; CHECK-GI-NEXT: ldr w12, [sp, #152] 1733; CHECK-GI-NEXT: mov v1.b[1], w1 1734; CHECK-GI-NEXT: fmov s4, w11 1735; CHECK-GI-NEXT: ldr w11, [sp, #584] 1736; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 1737; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 1738; CHECK-GI-NEXT: movi v19.2d, #0000000000000000 1739; CHECK-GI-NEXT: mov v0.b[1], w9 1740; CHECK-GI-NEXT: ldr w9, [sp, #272] 1741; CHECK-GI-NEXT: fmov s2, w9 1742; CHECK-GI-NEXT: ldr w9, [sp, #144] 1743; CHECK-GI-NEXT: mov v1.b[2], w2 1744; CHECK-GI-NEXT: mov v0.b[2], w8 1745; CHECK-GI-NEXT: ldr w8, [sp, #528] 1746; CHECK-GI-NEXT: fmov s3, w9 1747; CHECK-GI-NEXT: mov v2.b[1], w10 1748; CHECK-GI-NEXT: ldr w9, [sp, #344] 1749; CHECK-GI-NEXT: ldr w10, [sp, #536] 1750; CHECK-GI-NEXT: fmov s5, w8 1751; CHECK-GI-NEXT: ldr w8, [sp, #288] 1752; CHECK-GI-NEXT: mov v1.b[3], w3 1753; CHECK-GI-NEXT: mov v3.b[1], w12 1754; CHECK-GI-NEXT: mov v4.b[1], w9 1755; CHECK-GI-NEXT: ldr w9, [sp, #160] 1756; CHECK-GI-NEXT: mov v5.b[1], w10 1757; CHECK-GI-NEXT: mov v2.b[2], w8 1758; CHECK-GI-NEXT: ldr w8, [sp, #104] 1759; CHECK-GI-NEXT: ldr w10, [sp, #352] 1760; CHECK-GI-NEXT: mov v1.b[4], w4 1761; 
CHECK-GI-NEXT: mov v3.b[2], w9 1762; CHECK-GI-NEXT: ldr w9, [sp, #544] 1763; CHECK-GI-NEXT: mov v0.b[3], w8 1764; CHECK-GI-NEXT: ldr w8, [sp, #296] 1765; CHECK-GI-NEXT: mov v4.b[2], w10 1766; CHECK-GI-NEXT: ldr w10, [sp, #360] 1767; CHECK-GI-NEXT: mov v5.b[2], w9 1768; CHECK-GI-NEXT: ldr w9, [sp, #168] 1769; CHECK-GI-NEXT: mov v2.b[3], w8 1770; CHECK-GI-NEXT: ldr w8, [sp, #112] 1771; CHECK-GI-NEXT: mov v1.b[5], w5 1772; CHECK-GI-NEXT: mov v3.b[3], w9 1773; CHECK-GI-NEXT: ldr w9, [sp, #552] 1774; CHECK-GI-NEXT: mov v0.b[4], w8 1775; CHECK-GI-NEXT: ldr w8, [sp, #304] 1776; CHECK-GI-NEXT: mov v4.b[3], w10 1777; CHECK-GI-NEXT: mov v5.b[3], w9 1778; CHECK-GI-NEXT: ldr w9, [sp, #176] 1779; CHECK-GI-NEXT: ldr w10, [sp, #368] 1780; CHECK-GI-NEXT: mov v2.b[4], w8 1781; CHECK-GI-NEXT: ldr w8, [sp, #120] 1782; CHECK-GI-NEXT: mov v1.b[6], w6 1783; CHECK-GI-NEXT: mov v3.b[4], w9 1784; CHECK-GI-NEXT: ldr w9, [sp, #560] 1785; CHECK-GI-NEXT: mov v0.b[5], w8 1786; CHECK-GI-NEXT: ldr w8, [sp, #312] 1787; CHECK-GI-NEXT: mov v4.b[4], w10 1788; CHECK-GI-NEXT: mov v5.b[4], w9 1789; CHECK-GI-NEXT: ldr w9, [sp, #184] 1790; CHECK-GI-NEXT: ldr w10, [sp, #376] 1791; CHECK-GI-NEXT: mov v2.b[5], w8 1792; CHECK-GI-NEXT: ldr w8, [sp, #128] 1793; CHECK-GI-NEXT: mov v1.b[7], w7 1794; CHECK-GI-NEXT: mov v3.b[5], w9 1795; CHECK-GI-NEXT: ldr w9, [sp, #568] 1796; CHECK-GI-NEXT: mov v0.b[6], w8 1797; CHECK-GI-NEXT: ldr w8, [sp, #320] 1798; CHECK-GI-NEXT: mov v4.b[5], w10 1799; CHECK-GI-NEXT: mov v5.b[5], w9 1800; CHECK-GI-NEXT: ldr w9, [sp, #192] 1801; CHECK-GI-NEXT: ldr w10, [sp, #384] 1802; CHECK-GI-NEXT: mov v2.b[6], w8 1803; CHECK-GI-NEXT: ldr w8, [sp, #136] 1804; CHECK-GI-NEXT: mov v3.b[6], w9 1805; CHECK-GI-NEXT: ldr w9, [sp, #576] 1806; CHECK-GI-NEXT: mov v0.b[7], w8 1807; CHECK-GI-NEXT: ldr w8, [sp, #328] 1808; CHECK-GI-NEXT: mov v4.b[6], w10 1809; CHECK-GI-NEXT: ldr w10, [sp, #200] 1810; CHECK-GI-NEXT: mov v5.b[6], w9 1811; CHECK-GI-NEXT: ldr w9, [sp, #392] 1812; CHECK-GI-NEXT: mov v2.b[7], w8 
1813; CHECK-GI-NEXT: ldr w8, [sp, #464] 1814; CHECK-GI-NEXT: mov v3.b[7], w10 1815; CHECK-GI-NEXT: ldr w10, [sp, #16] 1816; CHECK-GI-NEXT: fmov s6, w8 1817; CHECK-GI-NEXT: ldr w8, [sp, #208] 1818; CHECK-GI-NEXT: mov v4.b[7], w9 1819; CHECK-GI-NEXT: mov v1.b[8], w10 1820; CHECK-GI-NEXT: ldr w10, [sp, #656] 1821; CHECK-GI-NEXT: ldr w9, [sp, #472] 1822; CHECK-GI-NEXT: mov v5.b[7], w11 1823; CHECK-GI-NEXT: ldr w11, [sp, #400] 1824; CHECK-GI-NEXT: fmov d0, d0 1825; CHECK-GI-NEXT: fmov s7, w10 1826; CHECK-GI-NEXT: mov v6.b[1], w9 1827; CHECK-GI-NEXT: ldr w9, [sp, #592] 1828; CHECK-GI-NEXT: mov v3.b[8], w8 1829; CHECK-GI-NEXT: ldr w10, [sp, #664] 1830; CHECK-GI-NEXT: ldr w8, [sp, #24] 1831; CHECK-GI-NEXT: mov v4.b[8], w11 1832; CHECK-GI-NEXT: ldr w11, [sp, #216] 1833; CHECK-GI-NEXT: fmov d2, d2 1834; CHECK-GI-NEXT: mov v5.b[8], w9 1835; CHECK-GI-NEXT: ldr w9, [sp, #480] 1836; CHECK-GI-NEXT: mov v7.b[1], w10 1837; CHECK-GI-NEXT: mov v1.b[9], w8 1838; CHECK-GI-NEXT: ldr w8, [sp, #408] 1839; CHECK-GI-NEXT: ldr w10, [sp, #600] 1840; CHECK-GI-NEXT: mov v3.b[9], w11 1841; CHECK-GI-NEXT: mov v6.b[2], w9 1842; CHECK-GI-NEXT: ldr w9, [sp, #672] 1843; CHECK-GI-NEXT: ldr w11, [sp, #32] 1844; CHECK-GI-NEXT: mov v4.b[9], w8 1845; CHECK-GI-NEXT: ldr w8, [sp, #224] 1846; CHECK-GI-NEXT: mov v5.b[9], w10 1847; CHECK-GI-NEXT: ldr w10, [sp, #488] 1848; CHECK-GI-NEXT: mov v7.b[2], w9 1849; CHECK-GI-NEXT: mov v1.b[10], w11 1850; CHECK-GI-NEXT: ldr w9, [sp, #416] 1851; CHECK-GI-NEXT: ldr w11, [sp, #608] 1852; CHECK-GI-NEXT: mov v3.b[10], w8 1853; CHECK-GI-NEXT: mov v6.b[3], w10 1854; CHECK-GI-NEXT: ldr w10, [sp, #680] 1855; CHECK-GI-NEXT: ldr w8, [sp, #40] 1856; CHECK-GI-NEXT: mov v4.b[10], w9 1857; CHECK-GI-NEXT: ldr w9, [sp, #232] 1858; CHECK-GI-NEXT: mov v5.b[10], w11 1859; CHECK-GI-NEXT: ldr w11, [sp, #496] 1860; CHECK-GI-NEXT: mov v7.b[3], w10 1861; CHECK-GI-NEXT: mov v1.b[11], w8 1862; CHECK-GI-NEXT: ldr w8, [sp, #424] 1863; CHECK-GI-NEXT: ldr w10, [sp, #616] 1864; CHECK-GI-NEXT: mov 
v3.b[11], w9 1865; CHECK-GI-NEXT: mov v6.b[4], w11 1866; CHECK-GI-NEXT: ldr w11, [sp, #688] 1867; CHECK-GI-NEXT: ldr w9, [sp, #48] 1868; CHECK-GI-NEXT: mov v4.b[11], w8 1869; CHECK-GI-NEXT: ldr w8, [sp, #240] 1870; CHECK-GI-NEXT: mov v5.b[11], w10 1871; CHECK-GI-NEXT: ldr w10, [sp, #504] 1872; CHECK-GI-NEXT: mov v7.b[4], w11 1873; CHECK-GI-NEXT: mov v1.b[12], w9 1874; CHECK-GI-NEXT: ldr w9, [sp, #432] 1875; CHECK-GI-NEXT: ldr w11, [sp, #624] 1876; CHECK-GI-NEXT: mov v3.b[12], w8 1877; CHECK-GI-NEXT: mov v6.b[5], w10 1878; CHECK-GI-NEXT: ldr w10, [sp, #696] 1879; CHECK-GI-NEXT: ldr w8, [sp, #56] 1880; CHECK-GI-NEXT: mov v4.b[12], w9 1881; CHECK-GI-NEXT: ldr w9, [sp, #248] 1882; CHECK-GI-NEXT: mov v5.b[12], w11 1883; CHECK-GI-NEXT: ldr w11, [sp, #512] 1884; CHECK-GI-NEXT: mov v7.b[5], w10 1885; CHECK-GI-NEXT: mov v1.b[13], w8 1886; CHECK-GI-NEXT: ldr w8, [sp, #440] 1887; CHECK-GI-NEXT: ldr w10, [sp, #632] 1888; CHECK-GI-NEXT: mov v3.b[13], w9 1889; CHECK-GI-NEXT: mov v6.b[6], w11 1890; CHECK-GI-NEXT: ldr w11, [sp, #704] 1891; CHECK-GI-NEXT: ldr w9, [sp, #64] 1892; CHECK-GI-NEXT: mov v4.b[13], w8 1893; CHECK-GI-NEXT: ldr w8, [sp, #256] 1894; CHECK-GI-NEXT: mov v5.b[13], w10 1895; CHECK-GI-NEXT: ldr w10, [sp, #520] 1896; CHECK-GI-NEXT: mov v7.b[6], w11 1897; CHECK-GI-NEXT: mov v1.b[14], w9 1898; CHECK-GI-NEXT: ldr w9, [sp, #448] 1899; CHECK-GI-NEXT: ldr w11, [sp, #640] 1900; CHECK-GI-NEXT: mov v3.b[14], w8 1901; CHECK-GI-NEXT: mov v6.b[7], w10 1902; CHECK-GI-NEXT: ldr w10, [sp, #712] 1903; CHECK-GI-NEXT: ldr w8, [sp, #72] 1904; CHECK-GI-NEXT: mov v4.b[14], w9 1905; CHECK-GI-NEXT: ldr w9, [sp, #264] 1906; CHECK-GI-NEXT: mov v5.b[14], w11 1907; CHECK-GI-NEXT: mov v7.b[7], w10 1908; CHECK-GI-NEXT: sdot v18.4s, v0.16b, v2.16b 1909; CHECK-GI-NEXT: mov v1.b[15], w8 1910; CHECK-GI-NEXT: ldr w8, [sp, #456] 1911; CHECK-GI-NEXT: mov v3.b[15], w9 1912; CHECK-GI-NEXT: ldr w9, [sp, #648] 1913; CHECK-GI-NEXT: fmov d6, d6 1914; CHECK-GI-NEXT: mov v4.b[15], w8 1915; CHECK-GI-NEXT: mov 
v5.b[15], w9 1916; CHECK-GI-NEXT: fmov d7, d7 1917; CHECK-GI-NEXT: sdot v17.4s, v1.16b, v3.16b 1918; CHECK-GI-NEXT: sdot v19.4s, v4.16b, v5.16b 1919; CHECK-GI-NEXT: sdot v16.4s, v6.16b, v7.16b 1920; CHECK-GI-NEXT: add v0.4s, v17.4s, v18.4s 1921; CHECK-GI-NEXT: add v1.4s, v19.4s, v16.4s 1922; CHECK-GI-NEXT: addv s0, v0.4s 1923; CHECK-GI-NEXT: addv s1, v1.4s 1924; CHECK-GI-NEXT: fmov w8, s0 1925; CHECK-GI-NEXT: fmov w9, s1 1926; CHECK-GI-NEXT: add w0, w8, w9 1927; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1928; CHECK-GI-NEXT: ret 1929entry: 1930 %az = sext <24 x i8> %a to <24 x i32> 1931 %bz = sext <24 x i8> %b to <24 x i32> 1932 %m1 = mul nuw nsw <24 x i32> %az, %bz 1933 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m1) 1934 %cz = sext <24 x i8> %c to <24 x i32> 1935 %dz = sext <24 x i8> %d to <24 x i32> 1936 %m2 = mul nuw nsw <24 x i32> %cz, %dz 1937 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m2) 1938 %x = add i32 %r1, %r2 1939 ret i32 %x 1940} 1941 1942define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) { 1943; CHECK-SD-LABEL: test_sdot_v24i8_double_nomla: 1944; CHECK-SD: // %bb.0: // %entry 1945; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 1946; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 1947; CHECK-SD-NEXT: .cfi_offset w29, -16 1948; CHECK-SD-NEXT: fmov s0, w0 1949; CHECK-SD-NEXT: ldr b1, [sp, #336] 1950; CHECK-SD-NEXT: add x8, sp, #344 1951; CHECK-SD-NEXT: add x9, sp, #400 1952; CHECK-SD-NEXT: ldr b2, [sp, #80] 1953; CHECK-SD-NEXT: ldr b3, [sp, #464] 1954; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] 1955; CHECK-SD-NEXT: add x8, sp, #352 1956; CHECK-SD-NEXT: add x10, sp, #408 1957; CHECK-SD-NEXT: mov v0.b[1], w1 1958; CHECK-SD-NEXT: add x11, sp, #472 1959; CHECK-SD-NEXT: add x12, sp, #480 1960; CHECK-SD-NEXT: ld1 { v3.b }[1], [x11] 1961; CHECK-SD-NEXT: add x11, sp, #416 1962; CHECK-SD-NEXT: add x13, sp, #488 1963; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] 1964; CHECK-SD-NEXT: add x8, sp, #360 1965; CHECK-SD-NEXT: add x14, sp, #496 1966; CHECK-SD-NEXT: movi v4.16b, #1 1967; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 1968; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 1969; CHECK-SD-NEXT: mov v0.b[2], w2 1970; CHECK-SD-NEXT: ld1 { v3.b }[2], [x12] 1971; CHECK-SD-NEXT: add x12, sp, #424 1972; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] 1973; CHECK-SD-NEXT: add x8, sp, #368 1974; CHECK-SD-NEXT: movi v7.2d, #0000000000000000 1975; CHECK-SD-NEXT: movi v16.8b, #1 1976; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 1977; CHECK-SD-NEXT: ld1 { v3.b }[3], [x13] 1978; CHECK-SD-NEXT: add x13, sp, #432 1979; CHECK-SD-NEXT: mov v0.b[3], w3 1980; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] 1981; CHECK-SD-NEXT: add x8, sp, #376 1982; CHECK-SD-NEXT: ld1 { v3.b }[4], [x14] 1983; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] 1984; CHECK-SD-NEXT: add x8, sp, #384 1985; CHECK-SD-NEXT: mov v0.b[4], w4 1986; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] 1987; CHECK-SD-NEXT: add x8, sp, #392 1988; CHECK-SD-NEXT: mov v0.b[5], w5 1989; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] 1990; CHECK-SD-NEXT: add x8, sp, #16 1991; CHECK-SD-NEXT: mov v0.b[6], w6 1992; CHECK-SD-NEXT: ld1 { v1.b }[8], [x9] 1993; CHECK-SD-NEXT: add x9, sp, #88 1994; CHECK-SD-NEXT: ld1 { 
v2.b }[1], [x9] 1995; CHECK-SD-NEXT: add x9, sp, #40 1996; CHECK-SD-NEXT: ld1 { v1.b }[9], [x10] 1997; CHECK-SD-NEXT: add x10, sp, #96 1998; CHECK-SD-NEXT: mov v0.b[7], w7 1999; CHECK-SD-NEXT: ld1 { v2.b }[2], [x10] 2000; CHECK-SD-NEXT: add x10, sp, #56 2001; CHECK-SD-NEXT: ld1 { v1.b }[10], [x11] 2002; CHECK-SD-NEXT: add x11, sp, #104 2003; CHECK-SD-NEXT: ld1 { v2.b }[3], [x11] 2004; CHECK-SD-NEXT: add x11, sp, #72 2005; CHECK-SD-NEXT: ld1 { v0.b }[8], [x8] 2006; CHECK-SD-NEXT: add x8, sp, #24 2007; CHECK-SD-NEXT: ld1 { v1.b }[11], [x12] 2008; CHECK-SD-NEXT: add x12, sp, #112 2009; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12] 2010; CHECK-SD-NEXT: add x12, sp, #440 2011; CHECK-SD-NEXT: ld1 { v0.b }[9], [x8] 2012; CHECK-SD-NEXT: add x8, sp, #32 2013; CHECK-SD-NEXT: ld1 { v1.b }[12], [x13] 2014; CHECK-SD-NEXT: add x13, sp, #504 2015; CHECK-SD-NEXT: ld1 { v3.b }[5], [x13] 2016; CHECK-SD-NEXT: add x13, sp, #512 2017; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] 2018; CHECK-SD-NEXT: add x8, sp, #48 2019; CHECK-SD-NEXT: ld1 { v1.b }[13], [x12] 2020; CHECK-SD-NEXT: add x12, sp, #448 2021; CHECK-SD-NEXT: ld1 { v3.b }[6], [x13] 2022; CHECK-SD-NEXT: ld1 { v0.b }[11], [x9] 2023; CHECK-SD-NEXT: add x9, sp, #64 2024; CHECK-SD-NEXT: ld1 { v1.b }[14], [x12] 2025; CHECK-SD-NEXT: ld1 { v0.b }[12], [x8] 2026; CHECK-SD-NEXT: add x8, sp, #120 2027; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] 2028; CHECK-SD-NEXT: add x8, sp, #128 2029; CHECK-SD-NEXT: ld1 { v0.b }[13], [x10] 2030; CHECK-SD-NEXT: add x10, sp, #136 2031; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] 2032; CHECK-SD-NEXT: add x8, sp, #456 2033; CHECK-SD-NEXT: ld1 { v1.b }[15], [x8] 2034; CHECK-SD-NEXT: ld1 { v0.b }[14], [x9] 2035; CHECK-SD-NEXT: add x9, sp, #520 2036; CHECK-SD-NEXT: ld1 { v2.b }[7], [x10] 2037; CHECK-SD-NEXT: ld1 { v3.b }[7], [x9] 2038; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v4.16b 2039; CHECK-SD-NEXT: ld1 { v0.b }[15], [x11] 2040; CHECK-SD-NEXT: sdot v17.2s, v2.8b, v16.8b 2041; CHECK-SD-NEXT: sdot v7.2s, v3.8b, v16.8b 2042; CHECK-SD-NEXT: 
sdot v6.4s, v0.16b, v4.16b 2043; CHECK-SD-NEXT: addv s3, v5.4s 2044; CHECK-SD-NEXT: addp v1.2s, v17.2s, v17.2s 2045; CHECK-SD-NEXT: addp v2.2s, v7.2s, v7.2s 2046; CHECK-SD-NEXT: fmov w10, s3 2047; CHECK-SD-NEXT: addv s0, v6.4s 2048; CHECK-SD-NEXT: fmov w9, s1 2049; CHECK-SD-NEXT: fmov w11, s2 2050; CHECK-SD-NEXT: fmov w8, s0 2051; CHECK-SD-NEXT: add w8, w8, w9 2052; CHECK-SD-NEXT: add w9, w10, w11 2053; CHECK-SD-NEXT: add w0, w8, w9 2054; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 2055; CHECK-SD-NEXT: ret 2056; 2057; CHECK-GI-LABEL: test_sdot_v24i8_double_nomla: 2058; CHECK-GI: // %bb.0: // %entry 2059; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 2060; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 2061; CHECK-GI-NEXT: .cfi_offset w29, -16 2062; CHECK-GI-NEXT: ldr w9, [sp, #336] 2063; CHECK-GI-NEXT: ldr w8, [sp, #344] 2064; CHECK-GI-NEXT: fmov s0, w0 2065; CHECK-GI-NEXT: ldr w10, [sp, #16] 2066; CHECK-GI-NEXT: ldr w11, [sp, #88] 2067; CHECK-GI-NEXT: movi v4.8b, #1 2068; CHECK-GI-NEXT: fmov s1, w9 2069; CHECK-GI-NEXT: ldr w9, [sp, #464] 2070; CHECK-GI-NEXT: ldr w12, [sp, #400] 2071; CHECK-GI-NEXT: mov v0.b[1], w1 2072; CHECK-GI-NEXT: movi v5.8b, #1 2073; CHECK-GI-NEXT: movi v6.8b, #1 2074; CHECK-GI-NEXT: fmov s2, w9 2075; CHECK-GI-NEXT: ldr w9, [sp, #96] 2076; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 2077; CHECK-GI-NEXT: mov v1.b[1], w8 2078; CHECK-GI-NEXT: ldr w8, [sp, #352] 2079; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 2080; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 2081; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 2082; CHECK-GI-NEXT: mov v0.b[2], w2 2083; CHECK-GI-NEXT: mov v5.d[1], v4.d[0] 2084; CHECK-GI-NEXT: mov v6.d[1], v4.d[0] 2085; CHECK-GI-NEXT: mov v1.b[2], w8 2086; CHECK-GI-NEXT: ldr w8, [sp, #360] 2087; CHECK-GI-NEXT: mov v0.b[3], w3 2088; CHECK-GI-NEXT: mov v1.b[3], w8 2089; CHECK-GI-NEXT: ldr w8, [sp, #368] 2090; CHECK-GI-NEXT: mov v0.b[4], w4 2091; CHECK-GI-NEXT: mov v1.b[4], w8 2092; CHECK-GI-NEXT: ldr w8, [sp, 
#376] 2093; CHECK-GI-NEXT: mov v0.b[5], w5 2094; CHECK-GI-NEXT: mov v1.b[5], w8 2095; CHECK-GI-NEXT: ldr w8, [sp, #384] 2096; CHECK-GI-NEXT: mov v0.b[6], w6 2097; CHECK-GI-NEXT: mov v1.b[6], w8 2098; CHECK-GI-NEXT: ldr w8, [sp, #392] 2099; CHECK-GI-NEXT: mov v0.b[7], w7 2100; CHECK-GI-NEXT: mov v1.b[7], w8 2101; CHECK-GI-NEXT: ldr w8, [sp, #80] 2102; CHECK-GI-NEXT: fmov s3, w8 2103; CHECK-GI-NEXT: ldr w8, [sp, #472] 2104; CHECK-GI-NEXT: mov v0.b[8], w10 2105; CHECK-GI-NEXT: ldr w10, [sp, #408] 2106; CHECK-GI-NEXT: mov v1.b[8], w12 2107; CHECK-GI-NEXT: mov v2.b[1], w8 2108; CHECK-GI-NEXT: ldr w8, [sp, #24] 2109; CHECK-GI-NEXT: mov v3.b[1], w11 2110; CHECK-GI-NEXT: ldr w11, [sp, #480] 2111; CHECK-GI-NEXT: mov v0.b[9], w8 2112; CHECK-GI-NEXT: ldr w8, [sp, #32] 2113; CHECK-GI-NEXT: mov v1.b[9], w10 2114; CHECK-GI-NEXT: mov v2.b[2], w11 2115; CHECK-GI-NEXT: ldr w10, [sp, #416] 2116; CHECK-GI-NEXT: mov v3.b[2], w9 2117; CHECK-GI-NEXT: ldr w9, [sp, #104] 2118; CHECK-GI-NEXT: ldr w11, [sp, #488] 2119; CHECK-GI-NEXT: mov v0.b[10], w8 2120; CHECK-GI-NEXT: ldr w8, [sp, #40] 2121; CHECK-GI-NEXT: mov v1.b[10], w10 2122; CHECK-GI-NEXT: mov v2.b[3], w11 2123; CHECK-GI-NEXT: ldr w10, [sp, #424] 2124; CHECK-GI-NEXT: mov v3.b[3], w9 2125; CHECK-GI-NEXT: ldr w9, [sp, #112] 2126; CHECK-GI-NEXT: ldr w11, [sp, #496] 2127; CHECK-GI-NEXT: mov v0.b[11], w8 2128; CHECK-GI-NEXT: ldr w8, [sp, #48] 2129; CHECK-GI-NEXT: mov v1.b[11], w10 2130; CHECK-GI-NEXT: mov v2.b[4], w11 2131; CHECK-GI-NEXT: ldr w10, [sp, #432] 2132; CHECK-GI-NEXT: mov v3.b[4], w9 2133; CHECK-GI-NEXT: ldr w9, [sp, #120] 2134; CHECK-GI-NEXT: ldr w11, [sp, #504] 2135; CHECK-GI-NEXT: mov v0.b[12], w8 2136; CHECK-GI-NEXT: ldr w8, [sp, #56] 2137; CHECK-GI-NEXT: mov v1.b[12], w10 2138; CHECK-GI-NEXT: mov v2.b[5], w11 2139; CHECK-GI-NEXT: ldr w10, [sp, #440] 2140; CHECK-GI-NEXT: mov v3.b[5], w9 2141; CHECK-GI-NEXT: ldr w9, [sp, #128] 2142; CHECK-GI-NEXT: ldr w11, [sp, #512] 2143; CHECK-GI-NEXT: mov v0.b[13], w8 2144; 
CHECK-GI-NEXT: ldr w8, [sp, #64] 2145; CHECK-GI-NEXT: mov v1.b[13], w10 2146; CHECK-GI-NEXT: mov v2.b[6], w11 2147; CHECK-GI-NEXT: ldr w10, [sp, #448] 2148; CHECK-GI-NEXT: mov v3.b[6], w9 2149; CHECK-GI-NEXT: ldr w9, [sp, #136] 2150; CHECK-GI-NEXT: ldr w11, [sp, #520] 2151; CHECK-GI-NEXT: mov v0.b[14], w8 2152; CHECK-GI-NEXT: ldr w8, [sp, #72] 2153; CHECK-GI-NEXT: mov v1.b[14], w10 2154; CHECK-GI-NEXT: mov v2.b[7], w11 2155; CHECK-GI-NEXT: mov v3.b[7], w9 2156; CHECK-GI-NEXT: ldr w9, [sp, #456] 2157; CHECK-GI-NEXT: mov v0.b[15], w8 2158; CHECK-GI-NEXT: mov v1.b[15], w9 2159; CHECK-GI-NEXT: fmov d2, d2 2160; CHECK-GI-NEXT: fmov d3, d3 2161; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v5.16b 2162; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v6.16b 2163; CHECK-GI-NEXT: sdot v7.4s, v2.16b, v4.16b 2164; CHECK-GI-NEXT: sdot v17.4s, v3.16b, v4.16b 2165; CHECK-GI-NEXT: add v1.4s, v18.4s, v7.4s 2166; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s 2167; CHECK-GI-NEXT: addv s1, v1.4s 2168; CHECK-GI-NEXT: addv s0, v0.4s 2169; CHECK-GI-NEXT: fmov w9, s1 2170; CHECK-GI-NEXT: fmov w8, s0 2171; CHECK-GI-NEXT: add w0, w8, w9 2172; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 2173; CHECK-GI-NEXT: ret 2174entry: 2175 %az = sext <24 x i8> %a to <24 x i32> 2176 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %az) 2177 %cz = sext <24 x i8> %c to <24 x i32> 2178 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %cz) 2179 %x = add i32 %r1, %r2 2180 ret i32 %x 2181} 2182 2183define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 2184; CHECK-LABEL: test_udot_v25i8: 2185; CHECK: // %bb.0: // %entry 2186; CHECK-NEXT: ldp q3, q0, [x1] 2187; CHECK-NEXT: movi v5.2d, #0000000000000000 2188; CHECK-NEXT: ldp q2, q1, [x0] 2189; CHECK-NEXT: umull2 v4.8h, v0.16b, v1.16b 2190; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b 2191; CHECK-NEXT: umull v1.8h, v3.8b, v2.8b 2192; CHECK-NEXT: umull2 v2.8h, v3.16b, v2.16b 2193; CHECK-NEXT: ushll v3.4s, v4.4h, #0 2194; 
CHECK-NEXT: uaddl2 v4.4s, v1.8h, v0.8h 2195; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h 2196; CHECK-NEXT: mov v5.s[0], v3.s[0] 2197; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v2.8h 2198; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 2199; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h 2200; CHECK-NEXT: add v0.4s, v0.4s, v2.4s 2201; CHECK-NEXT: addv s0, v0.4s 2202; CHECK-NEXT: fmov w8, s0 2203; CHECK-NEXT: add w0, w8, w2 2204; CHECK-NEXT: ret 2205entry: 2206 %0 = load <25 x i8>, ptr %a 2207 %1 = zext <25 x i8> %0 to <25 x i32> 2208 %2 = load <25 x i8>, ptr %b 2209 %3 = zext <25 x i8> %2 to <25 x i32> 2210 %4 = mul nuw nsw <25 x i32> %3, %1 2211 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4) 2212 %op.extra = add i32 %5, %sum 2213 ret i32 %op.extra 2214} 2215 2216define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { 2217; CHECK-LABEL: test_udot_v25i8_nomla: 2218; CHECK: // %bb.0: // %entry 2219; CHECK-NEXT: ldp q2, q1, [x0] 2220; CHECK-NEXT: movi v0.2d, #0000000000000000 2221; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 2222; CHECK-NEXT: ushll v1.8h, v1.8b, #0 2223; CHECK-NEXT: ushll v4.8h, v2.8b, #0 2224; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 2225; CHECK-NEXT: ushll v3.4s, v3.4h, #0 2226; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h 2227; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h 2228; CHECK-NEXT: mov v0.s[0], v3.s[0] 2229; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h 2230; CHECK-NEXT: add v1.4s, v1.4s, v3.4s 2231; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h 2232; CHECK-NEXT: add v0.4s, v1.4s, v0.4s 2233; CHECK-NEXT: addv s0, v0.4s 2234; CHECK-NEXT: fmov w0, s0 2235; CHECK-NEXT: ret 2236entry: 2237 %0 = load <25 x i8>, ptr %a1 2238 %1 = zext <25 x i8> %0 to <25 x i32> 2239 %2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %1) 2240 ret i32 %2 2241} 2242define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 2243; CHECK-LABEL: test_sdot_v25i8: 2244; CHECK: // %bb.0: // %entry 2245; CHECK-NEXT: ldp q3, q0, [x1] 2246; CHECK-NEXT: movi v5.2d, #0000000000000000 2247; 
CHECK-NEXT: ldp q2, q1, [x0] 2248; CHECK-NEXT: smull2 v4.8h, v0.16b, v1.16b 2249; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b 2250; CHECK-NEXT: smull v1.8h, v3.8b, v2.8b 2251; CHECK-NEXT: smull2 v2.8h, v3.16b, v2.16b 2252; CHECK-NEXT: sshll v3.4s, v4.4h, #0 2253; CHECK-NEXT: saddl2 v4.4s, v1.8h, v0.8h 2254; CHECK-NEXT: saddl v0.4s, v1.4h, v0.4h 2255; CHECK-NEXT: mov v5.s[0], v3.s[0] 2256; CHECK-NEXT: saddw2 v1.4s, v4.4s, v2.8h 2257; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 2258; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h 2259; CHECK-NEXT: add v0.4s, v0.4s, v2.4s 2260; CHECK-NEXT: addv s0, v0.4s 2261; CHECK-NEXT: fmov w8, s0 2262; CHECK-NEXT: add w0, w8, w2 2263; CHECK-NEXT: ret 2264entry: 2265 %0 = load <25 x i8>, ptr %a 2266 %1 = sext <25 x i8> %0 to <25 x i32> 2267 %2 = load <25 x i8>, ptr %b 2268 %3 = sext <25 x i8> %2 to <25 x i32> 2269 %4 = mul nsw <25 x i32> %3, %1 2270 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4) 2271 %op.extra = add nsw i32 %5, %sum 2272 ret i32 %op.extra 2273} 2274 2275define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) { 2276; CHECK-LABEL: test_sdot_v25i8_double: 2277; CHECK: // %bb.0: // %entry 2278; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 2279; CHECK-NEXT: .cfi_def_cfa_offset 16 2280; CHECK-NEXT: .cfi_offset w29, -16 2281; CHECK-NEXT: ldr b0, [sp, #216] 2282; CHECK-NEXT: add x8, sp, #224 2283; CHECK-NEXT: ldr b1, [sp, #16] 2284; CHECK-NEXT: ldr b2, [sp, #280] 2285; CHECK-NEXT: add x9, sp, #240 2286; CHECK-NEXT: ldr b4, [sp, #80] 2287; CHECK-NEXT: ld1 { v0.b }[1], [x8] 2288; CHECK-NEXT: add x8, sp, #24 2289; CHECK-NEXT: add x10, sp, #48 2290; CHECK-NEXT: ld1 { v1.b }[1], [x8] 2291; CHECK-NEXT: add x8, sp, #232 2292; CHECK-NEXT: add x11, sp, #96 2293; CHECK-NEXT: ldr b5, [sp, #152] 2294; CHECK-NEXT: add x12, sp, #168 2295; CHECK-NEXT: ldr b6, [sp, #616] 2296; CHECK-NEXT: ld1 { v0.b }[2], [x8] 2297; CHECK-NEXT: add x8, sp, #32 2298; CHECK-NEXT: fmov s3, w0 2299; CHECK-NEXT: ld1 { v1.b }[2], [x8] 2300; CHECK-NEXT: add x8, sp, #288 2301; CHECK-NEXT: ldr b7, [sp, #416] 2302; CHECK-NEXT: ld1 { v2.b }[1], [x8] 2303; CHECK-NEXT: add x8, sp, #40 2304; CHECK-NEXT: ldr b22, [sp, #744] 2305; CHECK-NEXT: ld1 { v0.b }[3], [x9] 2306; CHECK-NEXT: add x9, sp, #248 2307; CHECK-NEXT: mov v3.b[1], w1 2308; CHECK-NEXT: ld1 { v1.b }[3], [x8] 2309; CHECK-NEXT: add x8, sp, #88 2310; CHECK-NEXT: ldr b23, [sp, #544] 2311; CHECK-NEXT: ld1 { v4.b }[1], [x8] 2312; CHECK-NEXT: add x8, sp, #256 2313; CHECK-NEXT: ldr b19, [sp, #680] 2314; CHECK-NEXT: ld1 { v0.b }[4], [x9] 2315; CHECK-NEXT: add x9, sp, #296 2316; CHECK-NEXT: ldr b20, [sp, #480] 2317; CHECK-NEXT: ld1 { v1.b }[4], [x10] 2318; CHECK-NEXT: ld1 { v2.b }[2], [x9] 2319; CHECK-NEXT: add x10, sp, #160 2320; CHECK-NEXT: ld1 { v4.b }[2], [x11] 2321; CHECK-NEXT: add x11, sp, #304 2322; CHECK-NEXT: ld1 { v5.b }[1], [x10] 2323; CHECK-NEXT: ld1 { v0.b }[5], [x8] 2324; CHECK-NEXT: add x8, sp, #56 2325; CHECK-NEXT: add x10, sp, #264 2326; CHECK-NEXT: ld1 { v1.b }[5], [x8] 2327; CHECK-NEXT: add x8, sp, #64 2328; CHECK-NEXT: ld1 { v2.b }[3], [x11] 2329; CHECK-NEXT: add x9, sp, #272 2330; CHECK-NEXT: ld1 { v5.b }[2], [x12] 2331; CHECK-NEXT: add x11, sp, #72 2332; 
CHECK-NEXT: ld1 { v0.b }[6], [x10] 2333; CHECK-NEXT: add x10, sp, #312 2334; CHECK-NEXT: mov v3.b[2], w2 2335; CHECK-NEXT: ld1 { v1.b }[6], [x8] 2336; CHECK-NEXT: add x8, sp, #104 2337; CHECK-NEXT: ld1 { v2.b }[4], [x10] 2338; CHECK-NEXT: ld1 { v4.b }[3], [x8] 2339; CHECK-NEXT: add x8, sp, #112 2340; CHECK-NEXT: add x10, sp, #128 2341; CHECK-NEXT: ld1 { v0.b }[7], [x9] 2342; CHECK-NEXT: add x9, sp, #320 2343; CHECK-NEXT: ldr b21, [sp, #552] 2344; CHECK-NEXT: ld1 { v2.b }[5], [x9] 2345; CHECK-NEXT: add x9, sp, #176 2346; CHECK-NEXT: ld1 { v1.b }[7], [x11] 2347; CHECK-NEXT: ld1 { v4.b }[4], [x8] 2348; CHECK-NEXT: add x8, sp, #624 2349; CHECK-NEXT: ld1 { v5.b }[3], [x9] 2350; CHECK-NEXT: ld1 { v6.b }[1], [x8] 2351; CHECK-NEXT: add x8, sp, #120 2352; CHECK-NEXT: add x9, sp, #328 2353; CHECK-NEXT: ld1 { v2.b }[6], [x9] 2354; CHECK-NEXT: add x9, sp, #184 2355; CHECK-NEXT: add x11, sp, #192 2356; CHECK-NEXT: ld1 { v4.b }[5], [x8] 2357; CHECK-NEXT: add x8, sp, #632 2358; CHECK-NEXT: ld1 { v5.b }[4], [x9] 2359; CHECK-NEXT: ld1 { v6.b }[2], [x8] 2360; CHECK-NEXT: add x9, sp, #640 2361; CHECK-NEXT: add x8, sp, #336 2362; CHECK-NEXT: ld1 { v2.b }[7], [x8] 2363; CHECK-NEXT: add x8, sp, #656 2364; CHECK-NEXT: smull v23.8h, v23.8b, v22.8b 2365; CHECK-NEXT: ld1 { v5.b }[5], [x11] 2366; CHECK-NEXT: add x11, sp, #648 2367; CHECK-NEXT: ld1 { v4.b }[6], [x10] 2368; CHECK-NEXT: ld1 { v6.b }[3], [x9] 2369; CHECK-NEXT: add x9, sp, #200 2370; CHECK-NEXT: add x10, sp, #136 2371; CHECK-NEXT: ldr b22, [sp, #352] 2372; CHECK-NEXT: add x12, sp, #360 2373; CHECK-NEXT: mov v3.b[3], w3 2374; CHECK-NEXT: ld1 { v5.b }[6], [x9] 2375; CHECK-NEXT: add x9, sp, #208 2376; CHECK-NEXT: ld1 { v4.b }[7], [x10] 2377; CHECK-NEXT: ld1 { v6.b }[4], [x11] 2378; CHECK-NEXT: add x11, sp, #424 2379; CHECK-NEXT: add x10, sp, #488 2380; CHECK-NEXT: ld1 { v7.b }[1], [x11] 2381; CHECK-NEXT: add x11, sp, #560 2382; CHECK-NEXT: ld1 { v20.b }[1], [x10] 2383; CHECK-NEXT: ld1 { v5.b }[7], [x9] 2384; CHECK-NEXT: add x9, sp, 
#440 2385; CHECK-NEXT: ld1 { v21.b }[1], [x11] 2386; CHECK-NEXT: ld1 { v6.b }[5], [x8] 2387; CHECK-NEXT: add x8, sp, #432 2388; CHECK-NEXT: ld1 { v22.b }[1], [x12] 2389; CHECK-NEXT: ld1 { v7.b }[2], [x8] 2390; CHECK-NEXT: add x11, sp, #496 2391; CHECK-NEXT: add x12, sp, #568 2392; CHECK-NEXT: add x13, sp, #368 2393; CHECK-NEXT: ld1 { v20.b }[2], [x11] 2394; CHECK-NEXT: ld1 { v21.b }[2], [x12] 2395; CHECK-NEXT: ld1 { v22.b }[2], [x13] 2396; CHECK-NEXT: add x10, sp, #448 2397; CHECK-NEXT: mov v3.b[4], w4 2398; CHECK-NEXT: ld1 { v7.b }[3], [x9] 2399; CHECK-NEXT: add x9, sp, #688 2400; CHECK-NEXT: add x11, sp, #576 2401; CHECK-NEXT: ld1 { v19.b }[1], [x9] 2402; CHECK-NEXT: add x9, sp, #696 2403; CHECK-NEXT: add x12, sp, #376 2404; CHECK-NEXT: ld1 { v21.b }[3], [x11] 2405; CHECK-NEXT: ld1 { v22.b }[3], [x12] 2406; CHECK-NEXT: add x11, sp, #512 2407; CHECK-NEXT: ld1 { v7.b }[4], [x10] 2408; CHECK-NEXT: add x10, sp, #504 2409; CHECK-NEXT: add x12, sp, #584 2410; CHECK-NEXT: ld1 { v19.b }[2], [x9] 2411; CHECK-NEXT: add x9, sp, #704 2412; CHECK-NEXT: ld1 { v20.b }[3], [x10] 2413; CHECK-NEXT: add x13, sp, #384 2414; CHECK-NEXT: mov v3.b[5], w5 2415; CHECK-NEXT: ld1 { v21.b }[4], [x12] 2416; CHECK-NEXT: ld1 { v22.b }[4], [x13] 2417; CHECK-NEXT: add x10, sp, #456 2418; CHECK-NEXT: ldr b16, [sp, #344] 2419; CHECK-NEXT: ld1 { v19.b }[3], [x9] 2420; CHECK-NEXT: add x9, sp, #712 2421; CHECK-NEXT: ld1 { v20.b }[4], [x11] 2422; CHECK-NEXT: ldr b17, [sp, #144] 2423; CHECK-NEXT: ld1 { v7.b }[5], [x10] 2424; CHECK-NEXT: add x10, sp, #520 2425; CHECK-NEXT: add x11, sp, #592 2426; CHECK-NEXT: add x12, sp, #392 2427; CHECK-NEXT: mov v3.b[6], w6 2428; CHECK-NEXT: ld1 { v19.b }[4], [x9] 2429; CHECK-NEXT: add x9, sp, #720 2430; CHECK-NEXT: ld1 { v20.b }[5], [x10] 2431; CHECK-NEXT: ld1 { v21.b }[5], [x11] 2432; CHECK-NEXT: ld1 { v22.b }[5], [x12] 2433; CHECK-NEXT: smull v16.8h, v17.8b, v16.8b 2434; CHECK-NEXT: add x8, sp, #664 2435; CHECK-NEXT: add x10, sp, #464 2436; CHECK-NEXT: add x11, sp, 
#528 2437; CHECK-NEXT: ld1 { v19.b }[5], [x9] 2438; CHECK-NEXT: add x9, sp, #728 2439; CHECK-NEXT: add x12, sp, #600 2440; CHECK-NEXT: add x13, sp, #400 2441; CHECK-NEXT: ld1 { v6.b }[6], [x8] 2442; CHECK-NEXT: ld1 { v20.b }[6], [x11] 2443; CHECK-NEXT: ld1 { v21.b }[6], [x12] 2444; CHECK-NEXT: ld1 { v22.b }[6], [x13] 2445; CHECK-NEXT: ld1 { v7.b }[6], [x10] 2446; CHECK-NEXT: ld1 { v19.b }[6], [x9] 2447; CHECK-NEXT: add x9, sp, #736 2448; CHECK-NEXT: mov v3.b[7], w7 2449; CHECK-NEXT: sshll v18.4s, v16.4h, #0 2450; CHECK-NEXT: movi v16.2d, #0000000000000000 2451; CHECK-NEXT: movi v17.2d, #0000000000000000 2452; CHECK-NEXT: add x8, sp, #672 2453; CHECK-NEXT: add x10, sp, #472 2454; CHECK-NEXT: add x11, sp, #608 2455; CHECK-NEXT: ld1 { v19.b }[7], [x9] 2456; CHECK-NEXT: add x9, sp, #536 2457; CHECK-NEXT: add x12, sp, #408 2458; CHECK-NEXT: ld1 { v20.b }[7], [x9] 2459; CHECK-NEXT: ld1 { v21.b }[7], [x11] 2460; CHECK-NEXT: ld1 { v22.b }[7], [x12] 2461; CHECK-NEXT: ld1 { v6.b }[7], [x8] 2462; CHECK-NEXT: ld1 { v7.b }[7], [x10] 2463; CHECK-NEXT: sshll v23.4s, v23.4h, #0 2464; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b 2465; CHECK-NEXT: smull v1.8h, v4.8b, v2.8b 2466; CHECK-NEXT: smull v2.8h, v3.8b, v5.8b 2467; CHECK-NEXT: smull v3.8h, v20.8b, v19.8b 2468; CHECK-NEXT: smull v4.8h, v22.8b, v21.8b 2469; CHECK-NEXT: mov v17.s[0], v18.s[0] 2470; CHECK-NEXT: smull v5.8h, v7.8b, v6.8b 2471; CHECK-NEXT: mov v16.s[0], v23.s[0] 2472; CHECK-NEXT: saddl2 v6.4s, v2.8h, v1.8h 2473; CHECK-NEXT: saddl v1.4s, v2.4h, v1.4h 2474; CHECK-NEXT: saddl2 v2.4s, v4.8h, v3.8h 2475; CHECK-NEXT: saddl v3.4s, v4.4h, v3.4h 2476; CHECK-NEXT: saddw v4.4s, v17.4s, v0.4h 2477; CHECK-NEXT: saddw v7.4s, v16.4s, v5.4h 2478; CHECK-NEXT: saddw2 v0.4s, v6.4s, v0.8h 2479; CHECK-NEXT: add v1.4s, v1.4s, v4.4s 2480; CHECK-NEXT: saddw2 v2.4s, v2.4s, v5.8h 2481; CHECK-NEXT: add v3.4s, v3.4s, v7.4s 2482; CHECK-NEXT: add v0.4s, v1.4s, v0.4s 2483; CHECK-NEXT: add v1.4s, v3.4s, v2.4s 2484; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 
2485; CHECK-NEXT: addv s0, v0.4s 2486; CHECK-NEXT: fmov w0, s0 2487; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 2488; CHECK-NEXT: ret 2489entry: 2490 %az = sext <25 x i8> %a to <25 x i32> 2491 %bz = sext <25 x i8> %b to <25 x i32> 2492 %m1 = mul nuw nsw <25 x i32> %az, %bz 2493 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1) 2494 %cz = sext <25 x i8> %c to <25 x i32> 2495 %dz = sext <25 x i8> %d to <25 x i32> 2496 %m2 = mul nuw nsw <25 x i32> %cz, %dz 2497 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2) 2498 %x = add i32 %r1, %r2 2499 ret i32 %x 2500} 2501
; test_sdot_v25i8_double_nomla: sign-extends the <25 x i8> arguments %a and %c
; to <25 x i32>, reduces each with llvm.vector.reduce.add.v25i32 and returns
; the sum of the two reductions. %b and %d are accepted but never read.
; The file header expects a GlobalISel fallback warning for this function; a
; single shared set of expectations covers both llc configurations. The
; expected code widens with sshll/saddl/saddw and contains no sdot.
; The expectations are autogenerated (see the note on the first line of the
; file); regenerate them with the update script instead of editing by hand.
2502define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) { 2503; CHECK-LABEL: test_sdot_v25i8_double_nomla: 2504; CHECK: // %bb.0: // %entry 2505; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 2506; CHECK-NEXT: .cfi_def_cfa_offset 16 2507; CHECK-NEXT: .cfi_offset w29, -16 2508; CHECK-NEXT: fmov s0, w0 2509; CHECK-NEXT: ldr b1, [sp, #80] 2510; CHECK-NEXT: add x10, sp, #88 2511; CHECK-NEXT: ldr b2, [sp, #16] 2512; CHECK-NEXT: add x9, sp, #96 2513; CHECK-NEXT: ldr b3, [sp, #480] 2514; CHECK-NEXT: ld1 { v1.b }[1], [x10] 2515; CHECK-NEXT: add x10, sp, #24 2516; CHECK-NEXT: ldr b4, [sp, #352] 2517; CHECK-NEXT: mov v0.b[1], w1 2518; CHECK-NEXT: ld1 { v2.b }[1], [x10] 2519; CHECK-NEXT: add x11, sp, #488 2520; CHECK-NEXT: add x10, sp, #360 2521; CHECK-NEXT: ldr b5, [sp, #416] 2522; CHECK-NEXT: add x8, sp, #104 2523; CHECK-NEXT: ld1 { v1.b }[2], [x9] 2524; CHECK-NEXT: add x9, sp, #32 2525; CHECK-NEXT: ld1 { v3.b }[1], [x11] 2526; CHECK-NEXT: ld1 { v2.b }[2], [x9] 2527; CHECK-NEXT: add x11, sp, #424 2528; CHECK-NEXT: ld1 { v4.b }[1], [x10] 2529; CHECK-NEXT: mov v0.b[2], w2 2530; CHECK-NEXT: ld1 { v5.b }[1], [x11] 2531; CHECK-NEXT: add x9, sp, #368 2532; CHECK-NEXT: ld1 { v1.b }[3], [x8] 2533; CHECK-NEXT: add x8, sp, #40 2534; CHECK-NEXT: add x12, sp, #496 2535; CHECK-NEXT: ld1 { v2.b }[3], [x8] 2536; CHECK-NEXT: ld1 { v4.b }[2], 
[x9] 2537; CHECK-NEXT: add x8, sp, #432 2538; CHECK-NEXT: ld1 { v3.b }[2], [x12] 2539; CHECK-NEXT: add x13, sp, #48 2540; CHECK-NEXT: ld1 { v5.b }[2], [x8] 2541; CHECK-NEXT: mov v0.b[3], w3 2542; CHECK-NEXT: add x10, sp, #112 2543; CHECK-NEXT: add x8, sp, #504 2544; CHECK-NEXT: ld1 { v2.b }[4], [x13] 2545; CHECK-NEXT: add x13, sp, #376 2546; CHECK-NEXT: ld1 { v1.b }[4], [x10] 2547; CHECK-NEXT: ld1 { v4.b }[3], [x13] 2548; CHECK-NEXT: add x13, sp, #440 2549; CHECK-NEXT: ld1 { v3.b }[3], [x8] 2550; CHECK-NEXT: ld1 { v5.b }[3], [x13] 2551; CHECK-NEXT: add x11, sp, #120 2552; CHECK-NEXT: add x8, sp, #56 2553; CHECK-NEXT: mov v0.b[4], w4 2554; CHECK-NEXT: add x13, sp, #512 2555; CHECK-NEXT: ld1 { v1.b }[5], [x11] 2556; CHECK-NEXT: ld1 { v2.b }[5], [x8] 2557; CHECK-NEXT: add x8, sp, #384 2558; CHECK-NEXT: add x11, sp, #448 2559; CHECK-NEXT: ld1 { v3.b }[4], [x13] 2560; CHECK-NEXT: ld1 { v4.b }[4], [x8] 2561; CHECK-NEXT: ld1 { v5.b }[4], [x11] 2562; CHECK-NEXT: add x12, sp, #128 2563; CHECK-NEXT: add x10, sp, #64 2564; CHECK-NEXT: add x8, sp, #520 2565; CHECK-NEXT: mov v0.b[5], w5 2566; CHECK-NEXT: ld1 { v1.b }[6], [x12] 2567; CHECK-NEXT: ld1 { v2.b }[6], [x10] 2568; CHECK-NEXT: add x10, sp, #392 2569; CHECK-NEXT: add x11, sp, #456 2570; CHECK-NEXT: ldr b6, [sp, #144] 2571; CHECK-NEXT: ldr b7, [sp, #544] 2572; CHECK-NEXT: ld1 { v3.b }[5], [x8] 2573; CHECK-NEXT: ld1 { v4.b }[5], [x10] 2574; CHECK-NEXT: ld1 { v5.b }[5], [x11] 2575; CHECK-NEXT: add x9, sp, #136 2576; CHECK-NEXT: sshll v6.8h, v6.8b, #0 2577; CHECK-NEXT: mov v0.b[6], w6 2578; CHECK-NEXT: ld1 { v1.b }[7], [x9] 2579; CHECK-NEXT: add x8, sp, #528 2580; CHECK-NEXT: add x9, sp, #400 2581; CHECK-NEXT: add x10, sp, #464 2582; CHECK-NEXT: sshll v7.8h, v7.8b, #0 2583; CHECK-NEXT: ld1 { v3.b }[6], [x8] 2584; CHECK-NEXT: ld1 { v4.b }[6], [x9] 2585; CHECK-NEXT: ld1 { v5.b }[6], [x10] 2586; CHECK-NEXT: movi v16.2d, #0000000000000000 2587; CHECK-NEXT: movi v17.2d, #0000000000000000 2588; CHECK-NEXT: add x14, sp, #72 2589; 
CHECK-NEXT: mov v0.b[7], w7 2590; CHECK-NEXT: sshll v6.4s, v6.4h, #0 2591; CHECK-NEXT: add x8, sp, #536 2592; CHECK-NEXT: add x9, sp, #408 2593; CHECK-NEXT: add x10, sp, #472 2594; CHECK-NEXT: sshll v7.4s, v7.4h, #0 2595; CHECK-NEXT: ld1 { v2.b }[7], [x14] 2596; CHECK-NEXT: ld1 { v3.b }[7], [x8] 2597; CHECK-NEXT: ld1 { v4.b }[7], [x9] 2598; CHECK-NEXT: ld1 { v5.b }[7], [x10] 2599; CHECK-NEXT: mov v16.s[0], v6.s[0] 2600; CHECK-NEXT: sshll v1.8h, v1.8b, #0 2601; CHECK-NEXT: mov v17.s[0], v7.s[0] 2602; CHECK-NEXT: sshll v0.8h, v0.8b, #0 2603; CHECK-NEXT: sshll v2.8h, v2.8b, #0 2604; CHECK-NEXT: sshll v3.8h, v3.8b, #0 2605; CHECK-NEXT: sshll v4.8h, v4.8b, #0 2606; CHECK-NEXT: sshll v5.8h, v5.8b, #0 2607; CHECK-NEXT: saddl v7.4s, v0.4h, v1.4h 2608; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h 2609; CHECK-NEXT: saddw v6.4s, v16.4s, v2.4h 2610; CHECK-NEXT: saddl v1.4s, v4.4h, v3.4h 2611; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h 2612; CHECK-NEXT: saddw v4.4s, v17.4s, v5.4h 2613; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h 2614; CHECK-NEXT: add v6.4s, v7.4s, v6.4s 2615; CHECK-NEXT: saddw2 v2.4s, v3.4s, v5.8h 2616; CHECK-NEXT: add v1.4s, v1.4s, v4.4s 2617; CHECK-NEXT: add v0.4s, v6.4s, v0.4s 2618; CHECK-NEXT: add v1.4s, v1.4s, v2.4s 2619; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 2620; CHECK-NEXT: addv s0, v0.4s 2621; CHECK-NEXT: fmov w0, s0 2622; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 2623; CHECK-NEXT: ret 2624entry: 2625 %az = sext <25 x i8> %a to <25 x i32> 2626 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az) 2627 %cz = sext <25 x i8> %c to <25 x i32> 2628 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz) 2629 %x = add i32 %r1, %r2 2630 ret i32 %x 2631} 2632
; test_udot_v32i8: loads <32 x i8> from %a and from %b, zero-extends both to
; <32 x i32>, multiplies them, reduces the product with
; llvm.vector.reduce.add.v32i32 and adds %sum to the result.
; SelectionDAG expectation: two udot instructions chained into one
; accumulator, then addv. GlobalISel expectation: two udot instructions into
; two separate accumulators, one vector add, then addv.
2633define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 2634; CHECK-SD-LABEL: test_udot_v32i8: 2635; CHECK-SD: // %bb.0: // %entry 2636; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 2637; CHECK-SD-NEXT: ldp q1, q3, [x0] 2638; CHECK-SD-NEXT: ldp q2, q4, 
[x1] 2639; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b 2640; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b 2641; CHECK-SD-NEXT: addv s0, v0.4s 2642; CHECK-SD-NEXT: fmov w8, s0 2643; CHECK-SD-NEXT: add w0, w8, w2 2644; CHECK-SD-NEXT: ret 2645; 2646; CHECK-GI-LABEL: test_udot_v32i8: 2647; CHECK-GI: // %bb.0: // %entry 2648; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 2649; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 2650; CHECK-GI-NEXT: ldp q2, q3, [x0] 2651; CHECK-GI-NEXT: ldp q4, q5, [x1] 2652; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b 2653; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b 2654; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s 2655; CHECK-GI-NEXT: addv s0, v0.4s 2656; CHECK-GI-NEXT: fmov w8, s0 2657; CHECK-GI-NEXT: add w0, w8, w2 2658; CHECK-GI-NEXT: ret 2659entry: 2660 %0 = load <32 x i8>, ptr %a 2661 %1 = zext <32 x i8> %0 to <32 x i32> 2662 %2 = load <32 x i8>, ptr %b 2663 %3 = zext <32 x i8> %2 to <32 x i32> 2664 %4 = mul nuw nsw <32 x i32> %3, %1 2665 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) 2666 %op.extra = add i32 %5, %sum 2667 ret i32 %op.extra 2668} 2669
; test_udot_v32i8_nomla: loads <32 x i8> from %a1, zero-extends it to
; <32 x i32> and returns its llvm.vector.reduce.add.v32i32 reduction; the IR
; contains no multiply.
; Both selectors are expected to use udot with a splat of 1
; (movi v0.16b, #1) as the second source operand. SelectionDAG chains both
; udot into one accumulator; GlobalISel uses two accumulators plus a vector add.
2670define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) { 2671; CHECK-SD-LABEL: test_udot_v32i8_nomla: 2672; CHECK-SD: // %bb.0: // %entry 2673; CHECK-SD-NEXT: movi v0.16b, #1 2674; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 2675; CHECK-SD-NEXT: ldp q2, q3, [x0] 2676; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b 2677; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b 2678; CHECK-SD-NEXT: addv s0, v1.4s 2679; CHECK-SD-NEXT: fmov w0, s0 2680; CHECK-SD-NEXT: ret 2681; 2682; CHECK-GI-LABEL: test_udot_v32i8_nomla: 2683; CHECK-GI: // %bb.0: // %entry 2684; CHECK-GI-NEXT: movi v0.16b, #1 2685; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 2686; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 2687; CHECK-GI-NEXT: ldp q3, q4, [x0] 2688; CHECK-GI-NEXT: udot v2.4s, v3.16b, v0.16b 2689; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b 2690; CHECK-GI-NEXT: add v0.4s, v2.4s, v1.4s 2691; CHECK-GI-NEXT: addv s0, 
v0.4s 2692; CHECK-GI-NEXT: fmov w0, s0 2693; CHECK-GI-NEXT: ret 2694entry: 2695 %0 = load <32 x i8>, ptr %a1 2696 %1 = zext <32 x i8> %0 to <32 x i32> 2697 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) 2698 ret i32 %2 2699}
; test_sdot_v32i8: loads <32 x i8> from %a and from %b, sign-extends both to
; <32 x i32>, multiplies them (nsw), reduces the product with
; llvm.vector.reduce.add.v32i32 and adds %sum (nsw) to the result.
; SelectionDAG expectation: two sdot instructions chained into one
; accumulator. GlobalISel expectation: two sdot instructions into separate
; accumulators followed by a vector add.
2700define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 2701; CHECK-SD-LABEL: test_sdot_v32i8: 2702; CHECK-SD: // %bb.0: // %entry 2703; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 2704; CHECK-SD-NEXT: ldp q1, q3, [x0] 2705; CHECK-SD-NEXT: ldp q2, q4, [x1] 2706; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v3.16b 2707; CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b 2708; CHECK-SD-NEXT: addv s0, v0.4s 2709; CHECK-SD-NEXT: fmov w8, s0 2710; CHECK-SD-NEXT: add w0, w8, w2 2711; CHECK-SD-NEXT: ret 2712; 2713; CHECK-GI-LABEL: test_sdot_v32i8: 2714; CHECK-GI: // %bb.0: // %entry 2715; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 2716; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 2717; CHECK-GI-NEXT: ldp q2, q3, [x0] 2718; CHECK-GI-NEXT: ldp q4, q5, [x1] 2719; CHECK-GI-NEXT: sdot v1.4s, v4.16b, v2.16b 2720; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b 2721; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s 2722; CHECK-GI-NEXT: addv s0, v0.4s 2723; CHECK-GI-NEXT: fmov w8, s0 2724; CHECK-GI-NEXT: add w0, w8, w2 2725; CHECK-GI-NEXT: ret 2726entry: 2727 %0 = load <32 x i8>, ptr %a 2728 %1 = sext <32 x i8> %0 to <32 x i32> 2729 %2 = load <32 x i8>, ptr %b 2730 %3 = sext <32 x i8> %2 to <32 x i32> 2731 %4 = mul nsw <32 x i32> %3, %1 2732 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) 2733 %op.extra = add nsw i32 %5, %sum 2734 ret i32 %op.extra 2735} 2736
; test_sdot_v32i8_double: takes four <32 x i8> values as arguments and returns
; reduce.add(sext(%a) * sext(%b)) + reduce.add(sext(%c) * sext(%d)), each
; product computed in <32 x i32>.
; SelectionDAG expectation: four sdot instructions into two accumulators, one
; vector add and a single addv. GlobalISel expectation: four accumulators, two
; vector adds, two addv and a final scalar add.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; presumably harmless for a codegen test - confirm before
; reusing this IR as a semantic reference.
2737define i32 @test_sdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { 2738; CHECK-SD-LABEL: test_sdot_v32i8_double: 2739; CHECK-SD: // %bb.0: // %entry 2740; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 2741; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 2742; CHECK-SD-NEXT: sdot v17.4s, v1.16b, v3.16b 2743; CHECK-SD-NEXT: sdot v16.4s, 
v5.16b, v7.16b 2744; CHECK-SD-NEXT: sdot v17.4s, v0.16b, v2.16b 2745; CHECK-SD-NEXT: sdot v16.4s, v4.16b, v6.16b 2746; CHECK-SD-NEXT: add v0.4s, v17.4s, v16.4s 2747; CHECK-SD-NEXT: addv s0, v0.4s 2748; CHECK-SD-NEXT: fmov w0, s0 2749; CHECK-SD-NEXT: ret 2750; 2751; CHECK-GI-LABEL: test_sdot_v32i8_double: 2752; CHECK-GI: // %bb.0: // %entry 2753; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 2754; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 2755; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 2756; CHECK-GI-NEXT: movi v19.2d, #0000000000000000 2757; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v2.16b 2758; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v3.16b 2759; CHECK-GI-NEXT: sdot v17.4s, v5.16b, v7.16b 2760; CHECK-GI-NEXT: sdot v19.4s, v4.16b, v6.16b 2761; CHECK-GI-NEXT: add v0.4s, v16.4s, v18.4s 2762; CHECK-GI-NEXT: add v1.4s, v19.4s, v17.4s 2763; CHECK-GI-NEXT: addv s0, v0.4s 2764; CHECK-GI-NEXT: addv s1, v1.4s 2765; CHECK-GI-NEXT: fmov w8, s0 2766; CHECK-GI-NEXT: fmov w9, s1 2767; CHECK-GI-NEXT: add w0, w8, w9 2768; CHECK-GI-NEXT: ret 2769entry: 2770 %az = sext <32 x i8> %a to <32 x i32> 2771 %bz = sext <32 x i8> %b to <32 x i32> 2772 %m1 = mul nuw nsw <32 x i32> %az, %bz 2773 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1) 2774 %cz = sext <32 x i8> %c to <32 x i32> 2775 %dz = sext <32 x i8> %d to <32 x i32> 2776 %m2 = mul nuw nsw <32 x i32> %cz, %dz 2777 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2) 2778 %x = add i32 %r1, %r2 2779 ret i32 %x 2780} 2781
; test_sdot_v32i8_double_nomla: sign-extends the <32 x i8> arguments %a and %c
; to <32 x i32>, reduces each with llvm.vector.reduce.add.v32i32 and returns
; the sum of the two reductions. %b and %d are accepted but never read.
; Both selectors are expected to use sdot against a splat of 1
; (movi v2.16b, #1). SelectionDAG accumulates into two registers and issues
; one addv; GlobalISel uses four accumulators, two addv and a scalar add.
2782define i32 @test_sdot_v32i8_double_nomla(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { 2783; CHECK-SD-LABEL: test_sdot_v32i8_double_nomla: 2784; CHECK-SD: // %bb.0: // %entry 2785; CHECK-SD-NEXT: movi v2.16b, #1 2786; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 2787; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 2788; CHECK-SD-NEXT: sdot v6.4s, v1.16b, v2.16b 2789; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v2.16b 2790; CHECK-SD-NEXT: sdot v6.4s, v0.16b, v2.16b 2791; CHECK-SD-NEXT: sdot 
v3.4s, v4.16b, v2.16b 2792; CHECK-SD-NEXT: add v0.4s, v6.4s, v3.4s 2793; CHECK-SD-NEXT: addv s0, v0.4s 2794; CHECK-SD-NEXT: fmov w0, s0 2795; CHECK-SD-NEXT: ret 2796; 2797; CHECK-GI-LABEL: test_sdot_v32i8_double_nomla: 2798; CHECK-GI: // %bb.0: // %entry 2799; CHECK-GI-NEXT: movi v2.16b, #1 2800; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 2801; CHECK-GI-NEXT: movi v6.2d, #0000000000000000 2802; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 2803; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 2804; CHECK-GI-NEXT: sdot v3.4s, v0.16b, v2.16b 2805; CHECK-GI-NEXT: sdot v6.4s, v5.16b, v2.16b 2806; CHECK-GI-NEXT: sdot v7.4s, v1.16b, v2.16b 2807; CHECK-GI-NEXT: sdot v16.4s, v4.16b, v2.16b 2808; CHECK-GI-NEXT: add v0.4s, v3.4s, v7.4s 2809; CHECK-GI-NEXT: add v1.4s, v16.4s, v6.4s 2810; CHECK-GI-NEXT: addv s0, v0.4s 2811; CHECK-GI-NEXT: addv s1, v1.4s 2812; CHECK-GI-NEXT: fmov w8, s0 2813; CHECK-GI-NEXT: fmov w9, s1 2814; CHECK-GI-NEXT: add w0, w8, w9 2815; CHECK-GI-NEXT: ret 2816entry: 2817 %az = sext <32 x i8> %a to <32 x i32> 2818 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %az) 2819 %cz = sext <32 x i8> %c to <32 x i32> 2820 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %cz) 2821 %x = add i32 %r1, %r2 2822 ret i32 %x 2823} 2824
; test_usdot_v32i8: mixed-signedness dot product. Loads <32 x i8> from %a
; (zero-extended) and from %b (sign-extended), multiplies them in <32 x i32>
; (nsw), reduces with llvm.vector.reduce.add.v32i32 and adds %sum (nsw).
; SelectionDAG expectation: two usdot instructions into separate
; accumulators, a vector add and addv. GlobalISel expectation: no usdot; the
; inputs are widened with sshll/ushll (and the *2 forms) and combined with
; mul/mla before the final adds and addv.
2825define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 2826; CHECK-SD-LABEL: test_usdot_v32i8: 2827; CHECK-SD: // %bb.0: // %entry 2828; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 2829; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 2830; CHECK-SD-NEXT: ldp q2, q3, [x0] 2831; CHECK-SD-NEXT: ldp q4, q5, [x1] 2832; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b 2833; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b 2834; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 2835; CHECK-SD-NEXT: addv s0, v0.4s 2836; CHECK-SD-NEXT: fmov w8, s0 2837; CHECK-SD-NEXT: add w0, w8, w2 2838; CHECK-SD-NEXT: ret 2839; 2840; CHECK-GI-LABEL: test_usdot_v32i8: 2841; CHECK-GI: // %bb.0: // %entry 2842; CHECK-GI-NEXT: ldp 
q0, q1, [x1] 2843; CHECK-GI-NEXT: ldp q2, q3, [x0] 2844; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 2845; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 2846; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 2847; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 2848; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0 2849; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0 2850; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 2851; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 2852; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 2853; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 2854; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 2855; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 2856; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 2857; CHECK-GI-NEXT: ushll2 v21.4s, v2.8h, #0 2858; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0 2859; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 2860; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 2861; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 2862; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 2863; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 2864; CHECK-GI-NEXT: mul v16.4s, v16.4s, v20.4s 2865; CHECK-GI-NEXT: mul v17.4s, v17.4s, v21.4s 2866; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 2867; CHECK-GI-NEXT: mul v18.4s, v18.4s, v22.4s 2868; CHECK-GI-NEXT: mul v19.4s, v19.4s, v23.4s 2869; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 2870; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 2871; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 2872; CHECK-GI-NEXT: mla v16.4s, v4.4s, v6.4s 2873; CHECK-GI-NEXT: mla v17.4s, v0.4s, v2.4s 2874; CHECK-GI-NEXT: mla v18.4s, v5.4s, v7.4s 2875; CHECK-GI-NEXT: mla v19.4s, v1.4s, v3.4s 2876; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s 2877; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s 2878; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 2879; CHECK-GI-NEXT: addv s0, v0.4s 2880; CHECK-GI-NEXT: fmov w8, s0 2881; CHECK-GI-NEXT: add w0, w8, w2 2882; CHECK-GI-NEXT: ret 2883entry: 2884 %0 = load <32 x i8>, ptr %a 2885 %1 = zext <32 x i8> %0 to <32 x i32> 2886 %2 = load <32 x i8>, ptr %b 2887 %3 = sext <32 x i8> %2 to <32 x i32> 2888 %4 = mul nsw <32 x i32> %3, %1 2889 %5 = call i32 
@llvm.vector.reduce.add.v32i32(<32 x i32> %4) 2890 %op.extra = add nsw i32 %5, %sum 2891 ret i32 %op.extra 2892} 2893
; test_usdot_v32i8_double: takes four <32 x i8> values as arguments and returns
; reduce.add(zext(%a) * sext(%b)) + reduce.add(zext(%c) * sext(%d)), each
; product computed in <32 x i32>.
; SelectionDAG expectation: four usdot instructions into four accumulators,
; three vector adds and one addv. GlobalISel expectation: no usdot; the
; operands are widened with ushll/sshll and combined with mul/mla, and the
; code saves and restores d8-d15 on the stack.
; NOTE(review): both multiplies carry `nuw nsw` although one operand of each
; is sign-extended; presumably harmless for a codegen test - confirm before
; reusing this IR as a semantic reference.
2894define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { 2895; CHECK-SD-LABEL: test_usdot_v32i8_double: 2896; CHECK-SD: // %bb.0: // %entry 2897; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 2898; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 2899; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 2900; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 2901; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b 2902; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b 2903; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b 2904; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b 2905; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s 2906; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s 2907; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 2908; CHECK-SD-NEXT: addv s0, v0.4s 2909; CHECK-SD-NEXT: fmov w0, s0 2910; CHECK-SD-NEXT: ret 2911; 2912; CHECK-GI-LABEL: test_usdot_v32i8_double: 2913; CHECK-GI: // %bb.0: // %entry 2914; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill 2915; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill 2916; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill 2917; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill 2918; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 2919; CHECK-GI-NEXT: .cfi_offset b8, -8 2920; CHECK-GI-NEXT: .cfi_offset b9, -16 2921; CHECK-GI-NEXT: .cfi_offset b10, -24 2922; CHECK-GI-NEXT: .cfi_offset b11, -32 2923; CHECK-GI-NEXT: .cfi_offset b12, -40 2924; CHECK-GI-NEXT: .cfi_offset b13, -48 2925; CHECK-GI-NEXT: .cfi_offset b14, -56 2926; CHECK-GI-NEXT: .cfi_offset b15, -64 2927; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0 2928; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 2929; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0 2930; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 2931; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0 2932; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 2933; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0 2934; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 2935; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0 2936; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 2937; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0 2938; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0 2939; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0 2940; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0 2941; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0 2942; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0 2943; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0 2944; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0 2945; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0 2946; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0 2947; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0 2948; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0 2949; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0 2950; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0 2951; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0 2952; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0 2953; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 2954; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0 2955; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0 2956; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0 2957; 
CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0 2958; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0 2959; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s 2960; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s 2961; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s 2962; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s 2963; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s 2964; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s 2965; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 2966; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload 2967; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s 2968; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload 2969; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s 2970; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload 2971; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 2972; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 2973; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 2974; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0 2975; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 2976; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 2977; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 2978; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 2979; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 2980; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 2981; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 2982; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0 2983; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 2984; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0 2985; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 2986; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s 2987; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s 2988; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s 2989; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s 2990; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s 2991; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s 2992; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s 2993; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s 2994; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s 2995; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s 2996; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s 2997; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s 2998; CHECK-GI-NEXT: add v0.4s, v0.4s, 
v1.4s 2999; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s 3000; CHECK-GI-NEXT: addv s0, v0.4s 3001; CHECK-GI-NEXT: addv s1, v1.4s 3002; CHECK-GI-NEXT: fmov w8, s0 3003; CHECK-GI-NEXT: fmov w9, s1 3004; CHECK-GI-NEXT: add w0, w8, w9 3005; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload 3006; CHECK-GI-NEXT: ret 3007entry: 3008 %az = zext <32 x i8> %a to <32 x i32> 3009 %bz = sext <32 x i8> %b to <32 x i32> 3010 %m1 = mul nuw nsw <32 x i32> %az, %bz 3011 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1) 3012 %cz = zext <32 x i8> %c to <32 x i32> 3013 %dz = sext <32 x i8> %d to <32 x i32> 3014 %m2 = mul nuw nsw <32 x i32> %cz, %dz 3015 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2) 3016 %x = add i32 %r1, %r2 3017 ret i32 %x 3018} 3019 3020
; test_udot_v33i8: same shape as the 32-lane unsigned test but on <33 x i8>:
; loads from %a and %b, zero-extends both to <33 x i32>, multiplies, reduces
; with llvm.vector.reduce.add.v33i32 and adds %sum.
; The file header expects a GlobalISel fallback warning for this function; a
; single shared set of expectations covers both llc configurations. The
; expected code contains no udot: the first 32 lanes go through umull/umull2
; with uaddl/uaddw accumulation, and the 33rd byte is loaded separately
; (ldr b, offset #32) and inserted into lane 0 of a zeroed vector.
3021define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 3022; CHECK-LABEL: test_udot_v33i8: 3023; CHECK: // %bb.0: // %entry 3024; CHECK-NEXT: ldr b0, [x0, #32] 3025; CHECK-NEXT: ldr b1, [x1, #32] 3026; CHECK-NEXT: movi v7.2d, #0000000000000000 3027; CHECK-NEXT: ldp q3, q4, [x1] 3028; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b 3029; CHECK-NEXT: ldp q1, q2, [x0] 3030; CHECK-NEXT: umull v5.8h, v4.8b, v2.8b 3031; CHECK-NEXT: umull v6.8h, v3.8b, v1.8b 3032; CHECK-NEXT: umull2 v2.8h, v4.16b, v2.16b 3033; CHECK-NEXT: ushll v0.4s, v0.4h, #0 3034; CHECK-NEXT: umull2 v1.8h, v3.16b, v1.16b 3035; CHECK-NEXT: mov v7.s[0], v0.s[0] 3036; CHECK-NEXT: uaddl2 v3.4s, v6.8h, v5.8h 3037; CHECK-NEXT: uaddl2 v0.4s, v1.8h, v2.8h 3038; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h 3039; CHECK-NEXT: add v0.4s, v3.4s, v0.4s 3040; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h 3041; CHECK-NEXT: uaddw v2.4s, v2.4s, v5.4h 3042; CHECK-NEXT: add v0.4s, v1.4s, v0.4s 3043; CHECK-NEXT: add v0.4s, v2.4s, v0.4s 3044; CHECK-NEXT: addv s0, v0.4s 3045; CHECK-NEXT: fmov w8, s0 3046; CHECK-NEXT: add w0, w8, w2 3047; CHECK-NEXT: ret 3048entry: 3049 %0 = load <33 x i8>, ptr %a 3050 %1 = zext <33 x i8> %0 to <33 x i32> 
3051 %2 = load <33 x i8>, ptr %b 3052 %3 = zext <33 x i8> %2 to <33 x i32> 3053 %4 = mul nuw nsw <33 x i32> %3, %1 3054 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4) 3055 %op.extra = add i32 %5, %sum 3056 ret i32 %op.extra 3057} 3058
; test_udot_v33i8_nomla: loads <33 x i8> from %a1, zero-extends it to
; <33 x i32> and returns its llvm.vector.reduce.add.v33i32 reduction; the IR
; contains no multiply.
; The file header expects a GlobalISel fallback warning for this function; a
; single shared set of expectations covers both llc configurations. The
; expected code contains no udot: lanes are widened with ushll/ushll2 and
; summed with uaddl/uaddw, with the 33rd byte loaded separately (ldr b,
; offset #32).
3059define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { 3060; CHECK-LABEL: test_udot_v33i8_nomla: 3061; CHECK: // %bb.0: // %entry 3062; CHECK-NEXT: ldr b1, [x0, #32] 3063; CHECK-NEXT: ldp q3, q2, [x0] 3064; CHECK-NEXT: movi v0.2d, #0000000000000000 3065; CHECK-NEXT: ushll v1.8h, v1.8b, #0 3066; CHECK-NEXT: ushll v4.8h, v2.8b, #0 3067; CHECK-NEXT: ushll v5.8h, v3.8b, #0 3068; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 3069; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 3070; CHECK-NEXT: ushll v1.4s, v1.4h, #0 3071; CHECK-NEXT: uaddl2 v6.4s, v5.8h, v4.8h 3072; CHECK-NEXT: mov v0.s[0], v1.s[0] 3073; CHECK-NEXT: uaddl2 v1.4s, v3.8h, v2.8h 3074; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h 3075; CHECK-NEXT: add v1.4s, v6.4s, v1.4s 3076; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h 3077; CHECK-NEXT: add v1.4s, v2.4s, v1.4s 3078; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h 3079; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 3080; CHECK-NEXT: addv s0, v0.4s 3081; CHECK-NEXT: fmov w0, s0 3082; CHECK-NEXT: ret 3083entry: 3084 %0 = load <33 x i8>, ptr %a1 3085 %1 = zext <33 x i8> %0 to <33 x i32> 3086 %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1) 3087 ret i32 %2 3088}
; test_sdot_v33i8: loads <33 x i8> from %a and from %b, sign-extends both to
; <33 x i32>, multiplies them (nsw), reduces with
; llvm.vector.reduce.add.v33i32 and adds %sum (nsw).
; The file header expects a GlobalISel fallback warning for this function; a
; single shared set of expectations covers both llc configurations. The
; expected code contains no sdot: it uses smull/smull2 with saddl/saddw
; accumulation, and the 33rd byte is loaded separately (ldr b, offset #32).
3089define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 3090; CHECK-LABEL: test_sdot_v33i8: 3091; CHECK: // %bb.0: // %entry 3092; CHECK-NEXT: ldr b0, [x0, #32] 3093; CHECK-NEXT: ldr b1, [x1, #32] 3094; CHECK-NEXT: movi v7.2d, #0000000000000000 3095; CHECK-NEXT: ldp q3, q4, [x1] 3096; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b 3097; CHECK-NEXT: ldp q1, q2, [x0] 3098; CHECK-NEXT: smull v5.8h, v4.8b, v2.8b 3099; CHECK-NEXT: smull v6.8h, v3.8b, v1.8b 3100; CHECK-NEXT: smull2 v2.8h, v4.16b, v2.16b 3101; CHECK-NEXT: sshll v0.4s, v0.4h, #0 3102; 
CHECK-NEXT: smull2 v1.8h, v3.16b, v1.16b 3103; CHECK-NEXT: mov v7.s[0], v0.s[0] 3104; CHECK-NEXT: saddl2 v3.4s, v6.8h, v5.8h 3105; CHECK-NEXT: saddl2 v0.4s, v1.8h, v2.8h 3106; CHECK-NEXT: saddl v1.4s, v1.4h, v2.4h 3107; CHECK-NEXT: add v0.4s, v3.4s, v0.4s 3108; CHECK-NEXT: saddw v2.4s, v7.4s, v6.4h 3109; CHECK-NEXT: saddw v2.4s, v2.4s, v5.4h 3110; CHECK-NEXT: add v0.4s, v1.4s, v0.4s 3111; CHECK-NEXT: add v0.4s, v2.4s, v0.4s 3112; CHECK-NEXT: addv s0, v0.4s 3113; CHECK-NEXT: fmov w8, s0 3114; CHECK-NEXT: add w0, w8, w2 3115; CHECK-NEXT: ret 3116entry: 3117 %0 = load <33 x i8>, ptr %a 3118 %1 = sext <33 x i8> %0 to <33 x i32> 3119 %2 = load <33 x i8>, ptr %b 3120 %3 = sext <33 x i8> %2 to <33 x i32> 3121 %4 = mul nsw <33 x i32> %3, %1 3122 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4) 3123 %op.extra = add nsw i32 %5, %sum 3124 ret i32 %op.extra 3125} 3126 3127define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) { 3128; CHECK-LABEL: test_sdot_v33i8_double: 3129; CHECK: // %bb.0: // %entry 3130; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 3131; CHECK-NEXT: .cfi_def_cfa_offset 16 3132; CHECK-NEXT: .cfi_offset w29, -16 3133; CHECK-NEXT: ldr b0, [sp, #344] 3134; CHECK-NEXT: add x8, sp, #352 3135; CHECK-NEXT: ldr b1, [sp, #80] 3136; CHECK-NEXT: ldr b2, [sp, #216] 3137; CHECK-NEXT: add x9, sp, #96 3138; CHECK-NEXT: add x10, sp, #104 3139; CHECK-NEXT: ld1 { v0.b }[1], [x8] 3140; CHECK-NEXT: add x8, sp, #88 3141; CHECK-NEXT: ldr b4, [sp, #408] 3142; CHECK-NEXT: ld1 { v1.b }[1], [x8] 3143; CHECK-NEXT: add x8, sp, #360 3144; CHECK-NEXT: add x12, sp, #248 3145; CHECK-NEXT: add x13, sp, #432 3146; CHECK-NEXT: add x11, sp, #384 3147; CHECK-NEXT: ldr b5, [sp, #144] 3148; CHECK-NEXT: ld1 { v0.b }[2], [x8] 3149; CHECK-NEXT: add x8, sp, #224 3150; CHECK-NEXT: ldr b6, [sp, #280] 3151; CHECK-NEXT: ld1 { v2.b }[1], [x8] 3152; CHECK-NEXT: ld1 { v1.b }[2], [x9] 3153; CHECK-NEXT: add x8, sp, #368 3154; CHECK-NEXT: add x9, sp, #232 3155; CHECK-NEXT: ldr b16, [sp, #744] 3156; CHECK-NEXT: ldr b17, [sp, #480] 3157; CHECK-NEXT: ld1 { v0.b }[3], [x8] 3158; CHECK-NEXT: add x8, sp, #376 3159; CHECK-NEXT: ldr b18, [sp, #936] 3160; CHECK-NEXT: ld1 { v2.b }[2], [x9] 3161; CHECK-NEXT: ld1 { v1.b }[3], [x10] 3162; CHECK-NEXT: add x9, sp, #240 3163; CHECK-NEXT: add x10, sp, #392 3164; CHECK-NEXT: ldr b19, [sp, #672] 3165; CHECK-NEXT: ldr b7, [sp, #16] 3166; CHECK-NEXT: ld1 { v0.b }[4], [x8] 3167; CHECK-NEXT: add x8, sp, #112 3168; CHECK-NEXT: ldr b21, [sp, #1000] 3169; CHECK-NEXT: ld1 { v2.b }[3], [x9] 3170; CHECK-NEXT: ld1 { v1.b }[4], [x8] 3171; CHECK-NEXT: add x8, sp, #416 3172; CHECK-NEXT: ld1 { v4.b }[1], [x8] 3173; CHECK-NEXT: add x8, sp, #120 3174; CHECK-NEXT: add x9, sp, #400 3175; CHECK-NEXT: ld1 { v0.b }[5], [x11] 3176; CHECK-NEXT: add x11, sp, #128 3177; CHECK-NEXT: ldr b22, [sp, #736] 3178; CHECK-NEXT: ld1 { v2.b }[4], [x12] 3179; CHECK-NEXT: add x12, sp, #424 3180; CHECK-NEXT: ld1 { v1.b }[5], [x8] 3181; CHECK-NEXT: ld1 { v4.b }[2], [x12] 3182; CHECK-NEXT: add x12, sp, #152 3183; CHECK-NEXT: add x8, 
sp, #136 3184; CHECK-NEXT: ld1 { v5.b }[1], [x12] 3185; CHECK-NEXT: add x12, sp, #440 3186; CHECK-NEXT: ld1 { v0.b }[6], [x10] 3187; CHECK-NEXT: ld1 { v1.b }[6], [x11] 3188; CHECK-NEXT: add x11, sp, #288 3189; CHECK-NEXT: add x10, sp, #256 3190; CHECK-NEXT: ld1 { v4.b }[3], [x13] 3191; CHECK-NEXT: ld1 { v6.b }[1], [x11] 3192; CHECK-NEXT: add x11, sp, #296 3193; CHECK-NEXT: ld1 { v0.b }[7], [x9] 3194; CHECK-NEXT: add x9, sp, #160 3195; CHECK-NEXT: ld1 { v2.b }[5], [x10] 3196; CHECK-NEXT: ld1 { v5.b }[2], [x9] 3197; CHECK-NEXT: add x10, sp, #168 3198; CHECK-NEXT: ld1 { v1.b }[7], [x8] 3199; CHECK-NEXT: ld1 { v4.b }[4], [x12] 3200; CHECK-NEXT: add x12, sp, #448 3201; CHECK-NEXT: ld1 { v6.b }[2], [x11] 3202; CHECK-NEXT: add x11, sp, #304 3203; CHECK-NEXT: add x8, sp, #464 3204; CHECK-NEXT: add x13, sp, #768 3205; CHECK-NEXT: ld1 { v5.b }[3], [x10] 3206; CHECK-NEXT: add x10, sp, #176 3207; CHECK-NEXT: add x9, sp, #264 3208; CHECK-NEXT: ld1 { v4.b }[5], [x12] 3209; CHECK-NEXT: add x12, sp, #456 3210; CHECK-NEXT: ld1 { v6.b }[3], [x11] 3211; CHECK-NEXT: add x11, sp, #760 3212; CHECK-NEXT: ld1 { v2.b }[6], [x9] 3213; CHECK-NEXT: add x9, sp, #272 3214; CHECK-NEXT: ld1 { v5.b }[4], [x10] 3215; CHECK-NEXT: add x10, sp, #312 3216; CHECK-NEXT: fmov s3, w0 3217; CHECK-NEXT: ld1 { v4.b }[6], [x12] 3218; CHECK-NEXT: ld1 { v6.b }[4], [x10] 3219; CHECK-NEXT: add x10, sp, #320 3220; CHECK-NEXT: add x12, sp, #680 3221; CHECK-NEXT: ld1 { v2.b }[7], [x9] 3222; CHECK-NEXT: add x9, sp, #184 3223; CHECK-NEXT: ld1 { v19.b }[1], [x12] 3224; CHECK-NEXT: add x12, sp, #776 3225; CHECK-NEXT: ld1 { v5.b }[5], [x9] 3226; CHECK-NEXT: ld1 { v4.b }[7], [x8] 3227; CHECK-NEXT: add x8, sp, #752 3228; CHECK-NEXT: ld1 { v6.b }[5], [x10] 3229; CHECK-NEXT: ld1 { v16.b }[1], [x8] 3230; CHECK-NEXT: add x10, sp, #24 3231; CHECK-NEXT: smull v22.8h, v22.8b, v21.8b 3232; CHECK-NEXT: ld1 { v7.b }[1], [x10] 3233; CHECK-NEXT: add x10, sp, #496 3234; CHECK-NEXT: mov v3.b[1], w1 3235; CHECK-NEXT: add x9, sp, #192 
3236; CHECK-NEXT: ldr b20, [sp, #472] 3237; CHECK-NEXT: ldr b23, [sp, #208] 3238; CHECK-NEXT: ld1 { v16.b }[2], [x11] 3239; CHECK-NEXT: add x11, sp, #488 3240; CHECK-NEXT: ld1 { v5.b }[6], [x9] 3241; CHECK-NEXT: ld1 { v17.b }[1], [x11] 3242; CHECK-NEXT: add x11, sp, #944 3243; CHECK-NEXT: add x9, sp, #328 3244; CHECK-NEXT: ld1 { v18.b }[1], [x11] 3245; CHECK-NEXT: add x11, sp, #688 3246; CHECK-NEXT: ld1 { v6.b }[6], [x9] 3247; CHECK-NEXT: ld1 { v16.b }[3], [x13] 3248; CHECK-NEXT: ld1 { v19.b }[2], [x11] 3249; CHECK-NEXT: add x11, sp, #504 3250; CHECK-NEXT: ld1 { v17.b }[2], [x10] 3251; CHECK-NEXT: add x10, sp, #952 3252; CHECK-NEXT: add x13, sp, #784 3253; CHECK-NEXT: ld1 { v18.b }[2], [x10] 3254; CHECK-NEXT: add x10, sp, #32 3255; CHECK-NEXT: add x9, sp, #40 3256; CHECK-NEXT: ld1 { v16.b }[4], [x12] 3257; CHECK-NEXT: add x12, sp, #696 3258; CHECK-NEXT: ld1 { v7.b }[2], [x10] 3259; CHECK-NEXT: ld1 { v17.b }[3], [x11] 3260; CHECK-NEXT: add x11, sp, #960 3261; CHECK-NEXT: ld1 { v19.b }[3], [x12] 3262; CHECK-NEXT: ld1 { v18.b }[3], [x11] 3263; CHECK-NEXT: add x10, sp, #512 3264; CHECK-NEXT: add x11, sp, #704 3265; CHECK-NEXT: ld1 { v16.b }[5], [x13] 3266; CHECK-NEXT: add x12, sp, #792 3267; CHECK-NEXT: sshll v24.4s, v22.4h, #0 3268; CHECK-NEXT: ld1 { v17.b }[4], [x10] 3269; CHECK-NEXT: add x10, sp, #968 3270; CHECK-NEXT: ld1 { v19.b }[4], [x11] 3271; CHECK-NEXT: ld1 { v18.b }[4], [x10] 3272; CHECK-NEXT: add x10, sp, #520 3273; CHECK-NEXT: add x11, sp, #976 3274; CHECK-NEXT: ld1 { v16.b }[6], [x12] 3275; CHECK-NEXT: add x12, sp, #712 3276; CHECK-NEXT: smull v20.8h, v23.8b, v20.8b 3277; CHECK-NEXT: ld1 { v17.b }[5], [x10] 3278; CHECK-NEXT: ld1 { v19.b }[5], [x12] 3279; CHECK-NEXT: add x12, sp, #720 3280; CHECK-NEXT: ld1 { v18.b }[5], [x11] 3281; CHECK-NEXT: add x11, sp, #528 3282; CHECK-NEXT: add x10, sp, #800 3283; CHECK-NEXT: ld1 { v16.b }[7], [x10] 3284; CHECK-NEXT: add x10, sp, #536 3285; CHECK-NEXT: ldr b22, [sp, #872] 3286; CHECK-NEXT: ld1 { v17.b }[6], [x11] 
3287; CHECK-NEXT: add x11, sp, #984 3288; CHECK-NEXT: ld1 { v19.b }[6], [x12] 3289; CHECK-NEXT: ld1 { v18.b }[6], [x11] 3290; CHECK-NEXT: add x11, sp, #992 3291; CHECK-NEXT: add x12, sp, #728 3292; CHECK-NEXT: ldr b23, [sp, #608] 3293; CHECK-NEXT: ld1 { v7.b }[3], [x9] 3294; CHECK-NEXT: add x9, sp, #880 3295; CHECK-NEXT: ld1 { v17.b }[7], [x10] 3296; CHECK-NEXT: ld1 { v19.b }[7], [x12] 3297; CHECK-NEXT: add x10, sp, #816 3298; CHECK-NEXT: ld1 { v18.b }[7], [x11] 3299; CHECK-NEXT: add x11, sp, #552 3300; CHECK-NEXT: add x12, sp, #616 3301; CHECK-NEXT: mov v3.b[2], w2 3302; CHECK-NEXT: ld1 { v22.b }[1], [x9] 3303; CHECK-NEXT: ld1 { v23.b }[1], [x12] 3304; CHECK-NEXT: smull v16.8h, v17.8b, v16.8b 3305; CHECK-NEXT: add x12, sp, #560 3306; CHECK-NEXT: add x9, sp, #888 3307; CHECK-NEXT: smull v17.8h, v19.8b, v18.8b 3308; CHECK-NEXT: ldr b18, [sp, #808] 3309; CHECK-NEXT: ldr b19, [sp, #544] 3310; CHECK-NEXT: add x13, sp, #624 3311; CHECK-NEXT: ld1 { v22.b }[2], [x9] 3312; CHECK-NEXT: add x9, sp, #896 3313; CHECK-NEXT: ld1 { v18.b }[1], [x10] 3314; CHECK-NEXT: ld1 { v19.b }[1], [x11] 3315; CHECK-NEXT: add x11, sp, #824 3316; CHECK-NEXT: add x10, sp, #48 3317; CHECK-NEXT: ld1 { v23.b }[2], [x13] 3318; CHECK-NEXT: mov v3.b[3], w3 3319; CHECK-NEXT: ld1 { v7.b }[4], [x10] 3320; CHECK-NEXT: add x10, sp, #832 3321; CHECK-NEXT: ld1 { v22.b }[3], [x9] 3322; CHECK-NEXT: ld1 { v18.b }[2], [x11] 3323; CHECK-NEXT: ld1 { v19.b }[2], [x12] 3324; CHECK-NEXT: add x11, sp, #568 3325; CHECK-NEXT: add x12, sp, #632 3326; CHECK-NEXT: add x9, sp, #904 3327; CHECK-NEXT: add x13, sp, #640 3328; CHECK-NEXT: ld1 { v23.b }[3], [x12] 3329; CHECK-NEXT: add x12, sp, #576 3330; CHECK-NEXT: mov v3.b[4], w4 3331; CHECK-NEXT: ld1 { v18.b }[3], [x10] 3332; CHECK-NEXT: ld1 { v19.b }[3], [x11] 3333; CHECK-NEXT: add x11, sp, #840 3334; CHECK-NEXT: add x10, sp, #56 3335; CHECK-NEXT: ld1 { v22.b }[4], [x9] 3336; CHECK-NEXT: add x9, sp, #912 3337; CHECK-NEXT: ld1 { v23.b }[4], [x13] 3338; CHECK-NEXT: ld1 { v7.b 
}[5], [x10] 3339; CHECK-NEXT: add x10, sp, #848 3340; CHECK-NEXT: ld1 { v18.b }[4], [x11] 3341; CHECK-NEXT: ld1 { v19.b }[4], [x12] 3342; CHECK-NEXT: add x11, sp, #584 3343; CHECK-NEXT: add x12, sp, #648 3344; CHECK-NEXT: mov v3.b[5], w5 3345; CHECK-NEXT: ld1 { v22.b }[5], [x9] 3346; CHECK-NEXT: ld1 { v23.b }[5], [x12] 3347; CHECK-NEXT: add x12, sp, #592 3348; CHECK-NEXT: movi v21.2d, #0000000000000000 3349; CHECK-NEXT: ld1 { v18.b }[5], [x10] 3350; CHECK-NEXT: ld1 { v19.b }[5], [x11] 3351; CHECK-NEXT: add x11, sp, #856 3352; CHECK-NEXT: add x9, sp, #920 3353; CHECK-NEXT: add x13, sp, #656 3354; CHECK-NEXT: add x10, sp, #64 3355; CHECK-NEXT: ld1 { v22.b }[6], [x9] 3356; CHECK-NEXT: ld1 { v23.b }[6], [x13] 3357; CHECK-NEXT: mov v3.b[6], w6 3358; CHECK-NEXT: ld1 { v18.b }[6], [x11] 3359; CHECK-NEXT: ld1 { v19.b }[6], [x12] 3360; CHECK-NEXT: ld1 { v7.b }[6], [x10] 3361; CHECK-NEXT: add x10, sp, #864 3362; CHECK-NEXT: add x11, sp, #600 3363; CHECK-NEXT: add x9, sp, #928 3364; CHECK-NEXT: add x12, sp, #664 3365; CHECK-NEXT: mov v21.s[0], v24.s[0] 3366; CHECK-NEXT: ld1 { v22.b }[7], [x9] 3367; CHECK-NEXT: ld1 { v18.b }[7], [x10] 3368; CHECK-NEXT: ld1 { v19.b }[7], [x11] 3369; CHECK-NEXT: ld1 { v23.b }[7], [x12] 3370; CHECK-NEXT: add x8, sp, #200 3371; CHECK-NEXT: mov v3.b[7], w7 3372; CHECK-NEXT: add x10, sp, #336 3373; CHECK-NEXT: ld1 { v5.b }[7], [x8] 3374; CHECK-NEXT: add x8, sp, #72 3375; CHECK-NEXT: ld1 { v6.b }[7], [x10] 3376; CHECK-NEXT: smull v18.8h, v19.8b, v18.8b 3377; CHECK-NEXT: movi v19.2d, #0000000000000000 3378; CHECK-NEXT: ld1 { v7.b }[7], [x8] 3379; CHECK-NEXT: smull v22.8h, v23.8b, v22.8b 3380; CHECK-NEXT: sshll v20.4s, v20.4h, #0 3381; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b 3382; CHECK-NEXT: saddw v1.4s, v21.4s, v16.4h 3383; CHECK-NEXT: smull v2.8h, v3.8b, v2.8b 3384; CHECK-NEXT: smull v3.8h, v5.8b, v4.8b 3385; CHECK-NEXT: smull v4.8h, v7.8b, v6.8b 3386; CHECK-NEXT: mov v19.s[0], v20.s[0] 3387; CHECK-NEXT: saddl2 v5.4s, v18.8h, v17.8h 3388; CHECK-NEXT: 
saddl v7.4s, v18.4h, v17.4h 3389; CHECK-NEXT: saddl2 v6.4s, v16.8h, v22.8h 3390; CHECK-NEXT: saddw v1.4s, v1.4s, v22.4h 3391; CHECK-NEXT: saddl2 v17.4s, v2.8h, v0.8h 3392; CHECK-NEXT: saddl2 v16.4s, v4.8h, v3.8h 3393; CHECK-NEXT: saddl v3.4s, v4.4h, v3.4h 3394; CHECK-NEXT: saddw v2.4s, v19.4s, v2.4h 3395; CHECK-NEXT: add v5.4s, v6.4s, v5.4s 3396; CHECK-NEXT: add v1.4s, v1.4s, v7.4s 3397; CHECK-NEXT: add v6.4s, v17.4s, v16.4s 3398; CHECK-NEXT: saddw v0.4s, v2.4s, v0.4h 3399; CHECK-NEXT: add v1.4s, v1.4s, v5.4s 3400; CHECK-NEXT: add v0.4s, v0.4s, v3.4s 3401; CHECK-NEXT: add v1.4s, v6.4s, v1.4s 3402; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 3403; CHECK-NEXT: addv s0, v0.4s 3404; CHECK-NEXT: fmov w0, s0 3405; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 3406; CHECK-NEXT: ret 3407entry: 3408 %az = sext <33 x i8> %a to <33 x i32> 3409 %bz = sext <33 x i8> %b to <33 x i32> 3410 %m1 = mul nuw nsw <33 x i32> %az, %bz 3411 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m1) 3412 %cz = sext <33 x i8> %c to <33 x i32> 3413 %dz = sext <33 x i8> %d to <33 x i32> 3414 %m2 = mul nuw nsw <33 x i32> %cz, %dz 3415 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m2) 3416 %x = add i32 %r1, %r2 3417 ret i32 %x 3418} 3419 3420define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) { 3421; CHECK-LABEL: test_sdot_v33i8_double_nomla: 3422; CHECK: // %bb.0: // %entry 3423; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 3424; CHECK-NEXT: .cfi_def_cfa_offset 16 3425; CHECK-NEXT: .cfi_offset w29, -16 3426; CHECK-NEXT: ldr b0, [sp, #80] 3427; CHECK-NEXT: add x8, sp, #88 3428; CHECK-NEXT: ldr b2, [sp, #144] 3429; CHECK-NEXT: add x9, sp, #152 3430; CHECK-NEXT: ldr b3, [sp, #16] 3431; CHECK-NEXT: add x11, sp, #104 3432; CHECK-NEXT: ld1 { v0.b }[1], [x8] 3433; CHECK-NEXT: ld1 { v2.b }[1], [x9] 3434; CHECK-NEXT: add x9, sp, #24 3435; CHECK-NEXT: add x8, sp, #96 3436; CHECK-NEXT: ld1 { v3.b }[1], [x9] 3437; CHECK-NEXT: ldr b5, [sp, #480] 3438; CHECK-NEXT: fmov s1, w0 3439; CHECK-NEXT: add x10, sp, #112 3440; CHECK-NEXT: add x12, sp, #168 3441; CHECK-NEXT: ld1 { v0.b }[2], [x8] 3442; CHECK-NEXT: add x8, sp, #160 3443; CHECK-NEXT: ldr b4, [sp, #608] 3444; CHECK-NEXT: ld1 { v2.b }[2], [x8] 3445; CHECK-NEXT: add x8, sp, #32 3446; CHECK-NEXT: add x13, sp, #496 3447; CHECK-NEXT: ld1 { v3.b }[2], [x8] 3448; CHECK-NEXT: mov v1.b[1], w1 3449; CHECK-NEXT: ldr b6, [sp, #672] 3450; CHECK-NEXT: ld1 { v0.b }[3], [x11] 3451; CHECK-NEXT: add x11, sp, #488 3452; CHECK-NEXT: add x9, sp, #120 3453; CHECK-NEXT: ld1 { v5.b }[1], [x11] 3454; CHECK-NEXT: add x11, sp, #40 3455; CHECK-NEXT: ld1 { v2.b }[3], [x12] 3456; CHECK-NEXT: ld1 { v3.b }[3], [x11] 3457; CHECK-NEXT: add x12, sp, #616 3458; CHECK-NEXT: ldr b16, [sp, #544] 3459; CHECK-NEXT: ld1 { v0.b }[4], [x10] 3460; CHECK-NEXT: add x10, sp, #48 3461; CHECK-NEXT: ld1 { v4.b }[1], [x12] 3462; CHECK-NEXT: add x12, sp, #176 3463; CHECK-NEXT: ld1 { v5.b }[2], [x13] 3464; CHECK-NEXT: add x13, sp, #680 3465; CHECK-NEXT: ld1 { v3.b }[4], [x10] 3466; CHECK-NEXT: ld1 { v2.b }[4], [x12] 3467; CHECK-NEXT: ld1 { v6.b }[1], [x13] 3468; CHECK-NEXT: add x13, sp, #56 3469; CHECK-NEXT: ld1 { v0.b }[5], [x9] 3470; CHECK-NEXT: mov v1.b[2], w2 3471; CHECK-NEXT: add x8, sp, #128 3472; CHECK-NEXT: add x14, sp, #184 3473; CHECK-NEXT: add x11, sp, #136 3474; CHECK-NEXT: ld1 { v3.b }[5], [x13] 3475; CHECK-NEXT: add x13, sp, #552 3476; CHECK-NEXT: ld1 { v2.b 
}[5], [x14] 3477; CHECK-NEXT: ld1 { v16.b }[1], [x13] 3478; CHECK-NEXT: add x14, sp, #624 3479; CHECK-NEXT: ld1 { v0.b }[6], [x8] 3480; CHECK-NEXT: add x8, sp, #688 3481; CHECK-NEXT: add x13, sp, #504 3482; CHECK-NEXT: ld1 { v4.b }[2], [x14] 3483; CHECK-NEXT: ld1 { v6.b }[2], [x8] 3484; CHECK-NEXT: add x8, sp, #560 3485; CHECK-NEXT: ld1 { v5.b }[3], [x13] 3486; CHECK-NEXT: ld1 { v16.b }[2], [x8] 3487; CHECK-NEXT: mov v1.b[3], w3 3488; CHECK-NEXT: add x9, sp, #64 3489; CHECK-NEXT: add x15, sp, #632 3490; CHECK-NEXT: ld1 { v3.b }[6], [x9] 3491; CHECK-NEXT: ld1 { v0.b }[7], [x11] 3492; CHECK-NEXT: ld1 { v4.b }[3], [x15] 3493; CHECK-NEXT: add x8, sp, #696 3494; CHECK-NEXT: add x9, sp, #568 3495; CHECK-NEXT: add x11, sp, #512 3496; CHECK-NEXT: ld1 { v6.b }[3], [x8] 3497; CHECK-NEXT: ld1 { v16.b }[3], [x9] 3498; CHECK-NEXT: ld1 { v5.b }[4], [x11] 3499; CHECK-NEXT: add x8, sp, #640 3500; CHECK-NEXT: mov v1.b[4], w4 3501; CHECK-NEXT: ld1 { v4.b }[4], [x8] 3502; CHECK-NEXT: add x8, sp, #704 3503; CHECK-NEXT: add x9, sp, #576 3504; CHECK-NEXT: add x11, sp, #520 3505; CHECK-NEXT: ld1 { v6.b }[4], [x8] 3506; CHECK-NEXT: ld1 { v16.b }[4], [x9] 3507; CHECK-NEXT: ld1 { v5.b }[5], [x11] 3508; CHECK-NEXT: ldr b18, [sp, #736] 3509; CHECK-NEXT: add x12, sp, #192 3510; CHECK-NEXT: ld1 { v2.b }[6], [x12] 3511; CHECK-NEXT: add x8, sp, #648 3512; CHECK-NEXT: add x9, sp, #528 3513; CHECK-NEXT: add x11, sp, #712 3514; CHECK-NEXT: add x12, sp, #584 3515; CHECK-NEXT: sshll v18.8h, v18.8b, #0 3516; CHECK-NEXT: mov v1.b[5], w5 3517; CHECK-NEXT: ld1 { v6.b }[5], [x11] 3518; CHECK-NEXT: ld1 { v16.b }[5], [x12] 3519; CHECK-NEXT: ld1 { v4.b }[5], [x8] 3520; CHECK-NEXT: ld1 { v5.b }[6], [x9] 3521; CHECK-NEXT: movi v17.2d, #0000000000000000 3522; CHECK-NEXT: add x8, sp, #656 3523; CHECK-NEXT: add x9, sp, #536 3524; CHECK-NEXT: add x11, sp, #720 3525; CHECK-NEXT: add x12, sp, #592 3526; CHECK-NEXT: sshll v18.4s, v18.4h, #0 3527; CHECK-NEXT: ldr b7, [sp, #208] 3528; CHECK-NEXT: ld1 { v6.b }[6], [x11] 
3529; CHECK-NEXT: ld1 { v16.b }[6], [x12] 3530; CHECK-NEXT: ld1 { v4.b }[6], [x8] 3531; CHECK-NEXT: ld1 { v5.b }[7], [x9] 3532; CHECK-NEXT: mov v1.b[6], w6 3533; CHECK-NEXT: sshll v7.8h, v7.8b, #0 3534; CHECK-NEXT: add x8, sp, #664 3535; CHECK-NEXT: add x9, sp, #728 3536; CHECK-NEXT: add x11, sp, #600 3537; CHECK-NEXT: mov v17.s[0], v18.s[0] 3538; CHECK-NEXT: ld1 { v6.b }[7], [x9] 3539; CHECK-NEXT: ld1 { v16.b }[7], [x11] 3540; CHECK-NEXT: ld1 { v4.b }[7], [x8] 3541; CHECK-NEXT: sshll v5.8h, v5.8b, #0 3542; CHECK-NEXT: movi v18.2d, #0000000000000000 3543; CHECK-NEXT: add x10, sp, #200 3544; CHECK-NEXT: mov v1.b[7], w7 3545; CHECK-NEXT: add x9, sp, #72 3546; CHECK-NEXT: sshll v7.4s, v7.4h, #0 3547; CHECK-NEXT: ld1 { v2.b }[7], [x10] 3548; CHECK-NEXT: ld1 { v3.b }[7], [x9] 3549; CHECK-NEXT: sshll v6.8h, v6.8b, #0 3550; CHECK-NEXT: sshll v16.8h, v16.8b, #0 3551; CHECK-NEXT: sshll v4.8h, v4.8b, #0 3552; CHECK-NEXT: saddw v17.4s, v17.4s, v5.4h 3553; CHECK-NEXT: sshll v0.8h, v0.8b, #0 3554; CHECK-NEXT: mov v18.s[0], v7.s[0] 3555; CHECK-NEXT: sshll v1.8h, v1.8b, #0 3556; CHECK-NEXT: sshll v2.8h, v2.8b, #0 3557; CHECK-NEXT: sshll v3.8h, v3.8b, #0 3558; CHECK-NEXT: saddl2 v7.4s, v16.8h, v6.8h 3559; CHECK-NEXT: saddl2 v5.4s, v5.8h, v4.8h 3560; CHECK-NEXT: saddl v6.4s, v16.4h, v6.4h 3561; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h 3562; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h 3563; CHECK-NEXT: saddl2 v16.4s, v3.8h, v2.8h 3564; CHECK-NEXT: saddw v1.4s, v18.4s, v1.4h 3565; CHECK-NEXT: add v5.4s, v5.4s, v7.4s 3566; CHECK-NEXT: add v4.4s, v4.4s, v6.4s 3567; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h 3568; CHECK-NEXT: add v6.4s, v17.4s, v16.4s 3569; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h 3570; CHECK-NEXT: add v1.4s, v4.4s, v5.4s 3571; CHECK-NEXT: add v0.4s, v0.4s, v2.4s 3572; CHECK-NEXT: add v1.4s, v6.4s, v1.4s 3573; CHECK-NEXT: add v0.4s, v0.4s, v1.4s 3574; CHECK-NEXT: addv s0, v0.4s 3575; CHECK-NEXT: fmov w0, s0 3576; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 3577; 
CHECK-NEXT: ret 3578entry: 3579 %az = sext <33 x i8> %a to <33 x i32> 3580 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %az) 3581 %cz = sext <33 x i8> %c to <33 x i32> 3582 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %cz) 3583 %x = add i32 %r1, %r2 3584 ret i32 %x 3585} 3586 3587define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 3588; CHECK-SD-LABEL: test_udot_v48i8: 3589; CHECK-SD: // %bb.0: // %entry 3590; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 3591; CHECK-SD-NEXT: ldr q1, [x0, #32] 3592; CHECK-SD-NEXT: ldr q2, [x1, #32] 3593; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b 3594; CHECK-SD-NEXT: ldp q3, q1, [x0] 3595; CHECK-SD-NEXT: ldp q4, q2, [x1] 3596; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b 3597; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b 3598; CHECK-SD-NEXT: addv s0, v0.4s 3599; CHECK-SD-NEXT: fmov w8, s0 3600; CHECK-SD-NEXT: add w0, w8, w2 3601; CHECK-SD-NEXT: ret 3602; 3603; CHECK-GI-LABEL: test_udot_v48i8: 3604; CHECK-GI: // %bb.0: // %entry 3605; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 3606; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 3607; CHECK-GI-NEXT: ldr q7, [x0, #32] 3608; CHECK-GI-NEXT: ldp q3, q4, [x0] 3609; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 3610; CHECK-GI-NEXT: ldp q5, q6, [x1] 3611; CHECK-GI-NEXT: ldr q16, [x1, #32] 3612; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b 3613; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b 3614; CHECK-GI-NEXT: udot v2.4s, v16.16b, v7.16b 3615; CHECK-GI-NEXT: addv s0, v0.4s 3616; CHECK-GI-NEXT: addv s1, v1.4s 3617; CHECK-GI-NEXT: addv s2, v2.4s 3618; CHECK-GI-NEXT: fmov w8, s0 3619; CHECK-GI-NEXT: fmov w9, s1 3620; CHECK-GI-NEXT: add w8, w8, w9 3621; CHECK-GI-NEXT: fmov w9, s2 3622; CHECK-GI-NEXT: add w8, w8, w9 3623; CHECK-GI-NEXT: add w0, w8, w2 3624; CHECK-GI-NEXT: ret 3625entry: 3626 %0 = load <48 x i8>, ptr %a 3627 %1 = zext <48 x i8> %0 to <48 x i32> 3628 %2 = load <48 x i8>, ptr %b 3629 %3 = zext <48 x i8> %2 to <48 x i32> 3630 %4 = 
mul nuw nsw <48 x i32> %3, %1 3631 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4) 3632 %op.extra = add i32 %5, %sum 3633 ret i32 %op.extra 3634} 3635 3636define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) { 3637; CHECK-SD-LABEL: test_udot_v48i8_nomla: 3638; CHECK-SD: // %bb.0: // %entry 3639; CHECK-SD-NEXT: movi v0.16b, #1 3640; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 3641; CHECK-SD-NEXT: ldr q2, [x0, #32] 3642; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b 3643; CHECK-SD-NEXT: ldp q3, q2, [x0] 3644; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b 3645; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b 3646; CHECK-SD-NEXT: addv s0, v1.4s 3647; CHECK-SD-NEXT: fmov w0, s0 3648; CHECK-SD-NEXT: ret 3649; 3650; CHECK-GI-LABEL: test_udot_v48i8_nomla: 3651; CHECK-GI: // %bb.0: // %entry 3652; CHECK-GI-NEXT: movi v0.16b, #1 3653; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 3654; CHECK-GI-NEXT: ldr q6, [x0, #32] 3655; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 3656; CHECK-GI-NEXT: ldp q4, q5, [x0] 3657; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 3658; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b 3659; CHECK-GI-NEXT: udot v2.4s, v5.16b, v0.16b 3660; CHECK-GI-NEXT: udot v3.4s, v6.16b, v0.16b 3661; CHECK-GI-NEXT: addv s0, v1.4s 3662; CHECK-GI-NEXT: addv s1, v2.4s 3663; CHECK-GI-NEXT: addv s2, v3.4s 3664; CHECK-GI-NEXT: fmov w8, s0 3665; CHECK-GI-NEXT: fmov w9, s1 3666; CHECK-GI-NEXT: add w8, w8, w9 3667; CHECK-GI-NEXT: fmov w9, s2 3668; CHECK-GI-NEXT: add w0, w8, w9 3669; CHECK-GI-NEXT: ret 3670entry: 3671 %0 = load <48 x i8>, ptr %a1 3672 %1 = zext <48 x i8> %0 to <48 x i32> 3673 %2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %1) 3674 ret i32 %2 3675} 3676define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 3677; CHECK-SD-LABEL: test_sdot_v48i8: 3678; CHECK-SD: // %bb.0: // %entry 3679; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 3680; CHECK-SD-NEXT: ldr q1, [x0, #32] 3681; CHECK-SD-NEXT: ldr q2, [x1, #32] 3682; 
CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b 3683; CHECK-SD-NEXT: ldp q3, q1, [x0] 3684; CHECK-SD-NEXT: ldp q4, q2, [x1] 3685; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v3.16b 3686; CHECK-SD-NEXT: sdot v0.4s, v2.16b, v1.16b 3687; CHECK-SD-NEXT: addv s0, v0.4s 3688; CHECK-SD-NEXT: fmov w8, s0 3689; CHECK-SD-NEXT: add w0, w8, w2 3690; CHECK-SD-NEXT: ret 3691; 3692; CHECK-GI-LABEL: test_sdot_v48i8: 3693; CHECK-GI: // %bb.0: // %entry 3694; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 3695; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 3696; CHECK-GI-NEXT: ldr q7, [x0, #32] 3697; CHECK-GI-NEXT: ldp q3, q4, [x0] 3698; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 3699; CHECK-GI-NEXT: ldp q5, q6, [x1] 3700; CHECK-GI-NEXT: ldr q16, [x1, #32] 3701; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b 3702; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b 3703; CHECK-GI-NEXT: sdot v2.4s, v16.16b, v7.16b 3704; CHECK-GI-NEXT: addv s0, v0.4s 3705; CHECK-GI-NEXT: addv s1, v1.4s 3706; CHECK-GI-NEXT: addv s2, v2.4s 3707; CHECK-GI-NEXT: fmov w8, s0 3708; CHECK-GI-NEXT: fmov w9, s1 3709; CHECK-GI-NEXT: add w8, w8, w9 3710; CHECK-GI-NEXT: fmov w9, s2 3711; CHECK-GI-NEXT: add w8, w8, w9 3712; CHECK-GI-NEXT: add w0, w8, w2 3713; CHECK-GI-NEXT: ret 3714entry: 3715 %0 = load <48 x i8>, ptr %a 3716 %1 = sext <48 x i8> %0 to <48 x i32> 3717 %2 = load <48 x i8>, ptr %b 3718 %3 = sext <48 x i8> %2 to <48 x i32> 3719 %4 = mul nsw <48 x i32> %3, %1 3720 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4) 3721 %op.extra = add nsw i32 %5, %sum 3722 ret i32 %op.extra 3723} 3724 3725define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) { 3726; CHECK-SD-LABEL: test_sdot_v48i8_double: 3727; CHECK-SD: // %bb.0: // %entry 3728; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 3729; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 3730; CHECK-SD-NEXT: .cfi_offset w29, -16 3731; CHECK-SD-NEXT: ldr b3, [sp, #592] 3732; CHECK-SD-NEXT: add x8, sp, #600 3733; CHECK-SD-NEXT: ldr b6, [sp, #208] 3734; CHECK-SD-NEXT: ldr b0, [sp, #336] 3735; CHECK-SD-NEXT: add x9, sp, #344 3736; CHECK-SD-NEXT: ldr b2, [sp, #464] 3737; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] 3738; CHECK-SD-NEXT: add x8, sp, #216 3739; CHECK-SD-NEXT: add x10, sp, #624 3740; CHECK-SD-NEXT: ld1 { v6.b }[1], [x8] 3741; CHECK-SD-NEXT: add x8, sp, #608 3742; CHECK-SD-NEXT: ld1 { v0.b }[1], [x9] 3743; CHECK-SD-NEXT: add x9, sp, #232 3744; CHECK-SD-NEXT: fmov s1, w0 3745; CHECK-SD-NEXT: ldr b7, [sp, #1360] 3746; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] 3747; CHECK-SD-NEXT: add x8, sp, #224 3748; CHECK-SD-NEXT: add x11, sp, #648 3749; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] 3750; CHECK-SD-NEXT: add x8, sp, #616 3751; CHECK-SD-NEXT: add x12, sp, #376 3752; CHECK-SD-NEXT: mov v1.b[1], w1 3753; CHECK-SD-NEXT: ldr b16, [sp, #976] 3754; CHECK-SD-NEXT: add x14, sp, #288 3755; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] 3756; CHECK-SD-NEXT: add x8, sp, #632 3757; CHECK-SD-NEXT: add x15, sp, #408 3758; CHECK-SD-NEXT: ld1 { v6.b }[3], [x9] 3759; CHECK-SD-NEXT: add x9, sp, #472 3760; CHECK-SD-NEXT: add x13, sp, #696 3761; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] 3762; CHECK-SD-NEXT: add x9, sp, #240 3763; CHECK-SD-NEXT: add x16, sp, #448 3764; CHECK-SD-NEXT: ld1 { v3.b }[4], [x10] 3765; CHECK-SD-NEXT: add x10, sp, #352 3766; CHECK-SD-NEXT: mov v1.b[2], w2 3767; CHECK-SD-NEXT: ld1 { v6.b }[4], [x9] 3768; CHECK-SD-NEXT: ld1 { v0.b }[2], [x10] 3769; CHECK-SD-NEXT: add x10, sp, #1368 3770; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10] 3771; CHECK-SD-NEXT: add x10, sp, #248 3772; CHECK-SD-NEXT: add x9, sp, #640 3773; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] 3774; CHECK-SD-NEXT: add x8, sp, #656 3775; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 3776; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10] 3777; CHECK-SD-NEXT: add x10, sp, 
#360 3778; CHECK-SD-NEXT: mov v1.b[3], w3 3779; CHECK-SD-NEXT: ld1 { v0.b }[3], [x10] 3780; CHECK-SD-NEXT: add x10, sp, #256 3781; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 3782; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9] 3783; CHECK-SD-NEXT: add x9, sp, #368 3784; CHECK-SD-NEXT: ldr b17, [sp, #720] 3785; CHECK-SD-NEXT: ld1 { v6.b }[6], [x10] 3786; CHECK-SD-NEXT: add x10, sp, #984 3787; CHECK-SD-NEXT: ld1 { v0.b }[4], [x9] 3788; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10] 3789; CHECK-SD-NEXT: add x10, sp, #664 3790; CHECK-SD-NEXT: ld1 { v3.b }[7], [x11] 3791; CHECK-SD-NEXT: add x11, sp, #264 3792; CHECK-SD-NEXT: mov v1.b[4], w4 3793; CHECK-SD-NEXT: ld1 { v6.b }[7], [x11] 3794; CHECK-SD-NEXT: add x9, sp, #672 3795; CHECK-SD-NEXT: add x11, sp, #680 3796; CHECK-SD-NEXT: ld1 { v0.b }[5], [x12] 3797; CHECK-SD-NEXT: add x12, sp, #480 3798; CHECK-SD-NEXT: ld1 { v2.b }[2], [x12] 3799; CHECK-SD-NEXT: add x12, sp, #272 3800; CHECK-SD-NEXT: ld1 { v3.b }[8], [x8] 3801; CHECK-SD-NEXT: ld1 { v6.b }[8], [x12] 3802; CHECK-SD-NEXT: add x12, sp, #384 3803; CHECK-SD-NEXT: mov v1.b[5], w5 3804; CHECK-SD-NEXT: ld1 { v0.b }[6], [x12] 3805; CHECK-SD-NEXT: add x12, sp, #280 3806; CHECK-SD-NEXT: add x8, sp, #688 3807; CHECK-SD-NEXT: ld1 { v3.b }[9], [x10] 3808; CHECK-SD-NEXT: add x10, sp, #1376 3809; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10] 3810; CHECK-SD-NEXT: add x10, sp, #392 3811; CHECK-SD-NEXT: ld1 { v6.b }[9], [x12] 3812; CHECK-SD-NEXT: ld1 { v0.b }[7], [x10] 3813; CHECK-SD-NEXT: mov v1.b[6], w6 3814; CHECK-SD-NEXT: add x12, sp, #704 3815; CHECK-SD-NEXT: ld1 { v3.b }[10], [x9] 3816; CHECK-SD-NEXT: add x9, sp, #400 3817; CHECK-SD-NEXT: add x10, sp, #712 3818; CHECK-SD-NEXT: ld1 { v6.b }[10], [x14] 3819; CHECK-SD-NEXT: add x14, sp, #992 3820; CHECK-SD-NEXT: ld1 { v0.b }[8], [x9] 3821; CHECK-SD-NEXT: ld1 { v16.b }[2], [x14] 3822; CHECK-SD-NEXT: add x14, sp, #296 3823; CHECK-SD-NEXT: ld1 { v3.b }[11], [x11] 3824; CHECK-SD-NEXT: add x9, sp, #304 3825; CHECK-SD-NEXT: add x11, sp, #312 3826; 
CHECK-SD-NEXT: ld1 { v6.b }[11], [x14] 3827; CHECK-SD-NEXT: mov v1.b[7], w7 3828; CHECK-SD-NEXT: add x14, sp, #320 3829; CHECK-SD-NEXT: ld1 { v0.b }[9], [x15] 3830; CHECK-SD-NEXT: add x15, sp, #328 3831; CHECK-SD-NEXT: ld1 { v3.b }[12], [x8] 3832; CHECK-SD-NEXT: add x8, sp, #416 3833; CHECK-SD-NEXT: ld1 { v6.b }[12], [x9] 3834; CHECK-SD-NEXT: add x9, sp, #1384 3835; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] 3836; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9] 3837; CHECK-SD-NEXT: add x9, sp, #424 3838; CHECK-SD-NEXT: ld1 { v3.b }[13], [x13] 3839; CHECK-SD-NEXT: add x8, sp, #432 3840; CHECK-SD-NEXT: add x13, sp, #440 3841; CHECK-SD-NEXT: ld1 { v6.b }[13], [x11] 3842; CHECK-SD-NEXT: add x11, sp, #16 3843; CHECK-SD-NEXT: ld1 { v0.b }[11], [x9] 3844; CHECK-SD-NEXT: add x9, sp, #1000 3845; CHECK-SD-NEXT: ld1 { v1.b }[8], [x11] 3846; CHECK-SD-NEXT: ld1 { v16.b }[3], [x9] 3847; CHECK-SD-NEXT: ld1 { v3.b }[14], [x12] 3848; CHECK-SD-NEXT: add x12, sp, #488 3849; CHECK-SD-NEXT: ld1 { v6.b }[14], [x14] 3850; CHECK-SD-NEXT: add x14, sp, #1392 3851; CHECK-SD-NEXT: ld1 { v2.b }[3], [x12] 3852; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] 3853; CHECK-SD-NEXT: add x11, sp, #1008 3854; CHECK-SD-NEXT: ld1 { v0.b }[12], [x8] 3855; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] 3856; CHECK-SD-NEXT: add x8, sp, #1400 3857; CHECK-SD-NEXT: ld1 { v3.b }[15], [x10] 3858; CHECK-SD-NEXT: add x10, sp, #496 3859; CHECK-SD-NEXT: add x9, sp, #24 3860; CHECK-SD-NEXT: ld1 { v6.b }[15], [x15] 3861; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] 3862; CHECK-SD-NEXT: ld1 { v2.b }[4], [x10] 3863; CHECK-SD-NEXT: add x10, sp, #1016 3864; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10] 3865; CHECK-SD-NEXT: ld1 { v0.b }[13], [x13] 3866; CHECK-SD-NEXT: add x8, sp, #1408 3867; CHECK-SD-NEXT: ld1 { v1.b }[9], [x9] 3868; CHECK-SD-NEXT: add x9, sp, #504 3869; CHECK-SD-NEXT: add x10, sp, #512 3870; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] 3871; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] 3872; CHECK-SD-NEXT: add x9, sp, #1024 3873; CHECK-SD-NEXT: add x8, sp, #32 3874; 
CHECK-SD-NEXT: ld1 { v16.b }[6], [x9] 3875; CHECK-SD-NEXT: ld1 { v0.b }[14], [x16] 3876; CHECK-SD-NEXT: ld1 { v1.b }[10], [x8] 3877; CHECK-SD-NEXT: add x8, sp, #1416 3878; CHECK-SD-NEXT: add x9, sp, #456 3879; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] 3880; CHECK-SD-NEXT: ld1 { v2.b }[6], [x10] 3881; CHECK-SD-NEXT: add x10, sp, #1032 3882; CHECK-SD-NEXT: add x8, sp, #40 3883; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10] 3884; CHECK-SD-NEXT: ld1 { v0.b }[15], [x9] 3885; CHECK-SD-NEXT: ld1 { v1.b }[11], [x8] 3886; CHECK-SD-NEXT: add x8, sp, #1424 3887; CHECK-SD-NEXT: add x9, sp, #520 3888; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] 3889; CHECK-SD-NEXT: ld1 { v2.b }[7], [x9] 3890; CHECK-SD-NEXT: add x9, sp, #1040 3891; CHECK-SD-NEXT: add x8, sp, #48 3892; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] 3893; CHECK-SD-NEXT: add x10, sp, #528 3894; CHECK-SD-NEXT: ld1 { v1.b }[12], [x8] 3895; CHECK-SD-NEXT: add x8, sp, #1432 3896; CHECK-SD-NEXT: sdot v5.4s, v6.16b, v3.16b 3897; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] 3898; CHECK-SD-NEXT: ld1 { v2.b }[8], [x10] 3899; CHECK-SD-NEXT: add x8, sp, #1048 3900; CHECK-SD-NEXT: ldr b3, [sp, #80] 3901; CHECK-SD-NEXT: ld1 { v16.b }[9], [x8] 3902; CHECK-SD-NEXT: add x10, sp, #88 3903; CHECK-SD-NEXT: add x8, sp, #536 3904; CHECK-SD-NEXT: add x11, sp, #1440 3905; CHECK-SD-NEXT: add x9, sp, #56 3906; CHECK-SD-NEXT: ld1 { v3.b }[1], [x10] 3907; CHECK-SD-NEXT: ld1 { v2.b }[9], [x8] 3908; CHECK-SD-NEXT: add x8, sp, #1056 3909; CHECK-SD-NEXT: ld1 { v7.b }[10], [x11] 3910; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] 3911; CHECK-SD-NEXT: ld1 { v1.b }[13], [x9] 3912; CHECK-SD-NEXT: add x9, sp, #96 3913; CHECK-SD-NEXT: add x8, sp, #544 3914; CHECK-SD-NEXT: add x10, sp, #1448 3915; CHECK-SD-NEXT: ld1 { v3.b }[2], [x9] 3916; CHECK-SD-NEXT: ld1 { v2.b }[10], [x8] 3917; CHECK-SD-NEXT: add x8, sp, #1064 3918; CHECK-SD-NEXT: ld1 { v7.b }[11], [x10] 3919; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] 3920; CHECK-SD-NEXT: add x10, sp, #104 3921; CHECK-SD-NEXT: add x8, sp, #552 3922; 
CHECK-SD-NEXT: add x11, sp, #1456 3923; CHECK-SD-NEXT: add x9, sp, #64 3924; CHECK-SD-NEXT: ld1 { v3.b }[3], [x10] 3925; CHECK-SD-NEXT: ld1 { v2.b }[11], [x8] 3926; CHECK-SD-NEXT: add x8, sp, #1072 3927; CHECK-SD-NEXT: ld1 { v7.b }[12], [x11] 3928; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] 3929; CHECK-SD-NEXT: ld1 { v1.b }[14], [x9] 3930; CHECK-SD-NEXT: add x9, sp, #112 3931; CHECK-SD-NEXT: add x8, sp, #560 3932; CHECK-SD-NEXT: add x10, sp, #1464 3933; CHECK-SD-NEXT: ld1 { v3.b }[4], [x9] 3934; CHECK-SD-NEXT: ld1 { v2.b }[12], [x8] 3935; CHECK-SD-NEXT: add x8, sp, #1080 3936; CHECK-SD-NEXT: ld1 { v7.b }[13], [x10] 3937; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] 3938; CHECK-SD-NEXT: add x10, sp, #120 3939; CHECK-SD-NEXT: add x8, sp, #568 3940; CHECK-SD-NEXT: add x11, sp, #1472 3941; CHECK-SD-NEXT: add x9, sp, #72 3942; CHECK-SD-NEXT: ld1 { v3.b }[5], [x10] 3943; CHECK-SD-NEXT: ld1 { v2.b }[13], [x8] 3944; CHECK-SD-NEXT: add x8, sp, #1088 3945; CHECK-SD-NEXT: ld1 { v7.b }[14], [x11] 3946; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] 3947; CHECK-SD-NEXT: ld1 { v1.b }[15], [x9] 3948; CHECK-SD-NEXT: add x9, sp, #128 3949; CHECK-SD-NEXT: ldr b6, [sp, #1104] 3950; CHECK-SD-NEXT: add x10, sp, #1480 3951; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9] 3952; CHECK-SD-NEXT: add x8, sp, #1096 3953; CHECK-SD-NEXT: add x9, sp, #1112 3954; CHECK-SD-NEXT: ld1 { v7.b }[15], [x10] 3955; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] 3956; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] 3957; CHECK-SD-NEXT: add x8, sp, #728 3958; CHECK-SD-NEXT: add x9, sp, #576 3959; CHECK-SD-NEXT: add x10, sp, #136 3960; CHECK-SD-NEXT: ld1 { v17.b }[1], [x8] 3961; CHECK-SD-NEXT: add x8, sp, #1120 3962; CHECK-SD-NEXT: ld1 { v2.b }[14], [x9] 3963; CHECK-SD-NEXT: sdot v4.4s, v16.16b, v7.16b 3964; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] 3965; CHECK-SD-NEXT: add x8, sp, #736 3966; CHECK-SD-NEXT: ldr b7, [sp, #1232] 3967; CHECK-SD-NEXT: ldr b16, [sp, #848] 3968; CHECK-SD-NEXT: ld1 { v3.b }[7], [x10] 3969; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] 3970; 
CHECK-SD-NEXT: add x9, sp, #1240 3971; CHECK-SD-NEXT: add x10, sp, #856 3972; CHECK-SD-NEXT: ld1 { v7.b }[1], [x9] 3973; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10] 3974; CHECK-SD-NEXT: add x8, sp, #1128 3975; CHECK-SD-NEXT: add x11, sp, #744 3976; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] 3977; CHECK-SD-NEXT: add x10, sp, #1248 3978; CHECK-SD-NEXT: ld1 { v17.b }[3], [x11] 3979; CHECK-SD-NEXT: add x11, sp, #864 3980; CHECK-SD-NEXT: add x9, sp, #144 3981; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10] 3982; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11] 3983; CHECK-SD-NEXT: add x8, sp, #1136 3984; CHECK-SD-NEXT: add x12, sp, #752 3985; CHECK-SD-NEXT: ld1 { v3.b }[8], [x9] 3986; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] 3987; CHECK-SD-NEXT: ld1 { v17.b }[4], [x12] 3988; CHECK-SD-NEXT: add x9, sp, #1256 3989; CHECK-SD-NEXT: add x10, sp, #872 3990; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9] 3991; CHECK-SD-NEXT: ld1 { v16.b }[3], [x10] 3992; CHECK-SD-NEXT: add x8, sp, #1144 3993; CHECK-SD-NEXT: add x11, sp, #760 3994; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] 3995; CHECK-SD-NEXT: add x10, sp, #1264 3996; CHECK-SD-NEXT: ld1 { v17.b }[5], [x11] 3997; CHECK-SD-NEXT: add x11, sp, #880 3998; CHECK-SD-NEXT: add x9, sp, #152 3999; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10] 4000; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] 4001; CHECK-SD-NEXT: add x8, sp, #1152 4002; CHECK-SD-NEXT: add x12, sp, #768 4003; CHECK-SD-NEXT: ld1 { v3.b }[9], [x9] 4004; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] 4005; CHECK-SD-NEXT: ld1 { v17.b }[6], [x12] 4006; CHECK-SD-NEXT: add x9, sp, #1272 4007; CHECK-SD-NEXT: add x10, sp, #888 4008; CHECK-SD-NEXT: ld1 { v7.b }[5], [x9] 4009; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10] 4010; CHECK-SD-NEXT: add x8, sp, #1160 4011; CHECK-SD-NEXT: add x11, sp, #776 4012; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] 4013; CHECK-SD-NEXT: add x10, sp, #1280 4014; CHECK-SD-NEXT: ld1 { v17.b }[7], [x11] 4015; CHECK-SD-NEXT: add x11, sp, #896 4016; CHECK-SD-NEXT: add x9, sp, #160 4017; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] 4018; 
CHECK-SD-NEXT: ld1 { v16.b }[6], [x11] 4019; CHECK-SD-NEXT: add x8, sp, #1168 4020; CHECK-SD-NEXT: add x12, sp, #784 4021; CHECK-SD-NEXT: ld1 { v3.b }[10], [x9] 4022; CHECK-SD-NEXT: ld1 { v6.b }[8], [x8] 4023; CHECK-SD-NEXT: ld1 { v17.b }[8], [x12] 4024; CHECK-SD-NEXT: add x9, sp, #1288 4025; CHECK-SD-NEXT: add x10, sp, #904 4026; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] 4027; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10] 4028; CHECK-SD-NEXT: add x8, sp, #1176 4029; CHECK-SD-NEXT: add x11, sp, #792 4030; CHECK-SD-NEXT: ld1 { v6.b }[9], [x8] 4031; CHECK-SD-NEXT: add x10, sp, #1296 4032; CHECK-SD-NEXT: ld1 { v17.b }[9], [x11] 4033; CHECK-SD-NEXT: add x11, sp, #912 4034; CHECK-SD-NEXT: add x9, sp, #168 4035; CHECK-SD-NEXT: ld1 { v7.b }[8], [x10] 4036; CHECK-SD-NEXT: ld1 { v16.b }[8], [x11] 4037; CHECK-SD-NEXT: add x8, sp, #1184 4038; CHECK-SD-NEXT: add x12, sp, #800 4039; CHECK-SD-NEXT: ld1 { v3.b }[11], [x9] 4040; CHECK-SD-NEXT: ld1 { v6.b }[10], [x8] 4041; CHECK-SD-NEXT: ld1 { v17.b }[10], [x12] 4042; CHECK-SD-NEXT: add x9, sp, #1304 4043; CHECK-SD-NEXT: add x10, sp, #920 4044; CHECK-SD-NEXT: ld1 { v7.b }[9], [x9] 4045; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] 4046; CHECK-SD-NEXT: add x8, sp, #1192 4047; CHECK-SD-NEXT: add x11, sp, #808 4048; CHECK-SD-NEXT: ld1 { v6.b }[11], [x8] 4049; CHECK-SD-NEXT: add x10, sp, #1312 4050; CHECK-SD-NEXT: ld1 { v17.b }[11], [x11] 4051; CHECK-SD-NEXT: add x11, sp, #928 4052; CHECK-SD-NEXT: add x9, sp, #176 4053; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] 4054; CHECK-SD-NEXT: ld1 { v16.b }[10], [x11] 4055; CHECK-SD-NEXT: add x8, sp, #1200 4056; CHECK-SD-NEXT: add x12, sp, #816 4057; CHECK-SD-NEXT: ld1 { v3.b }[12], [x9] 4058; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] 4059; CHECK-SD-NEXT: ld1 { v17.b }[12], [x12] 4060; CHECK-SD-NEXT: add x9, sp, #1320 4061; CHECK-SD-NEXT: add x10, sp, #936 4062; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] 4063; CHECK-SD-NEXT: ld1 { v16.b }[11], [x10] 4064; CHECK-SD-NEXT: add x8, sp, #1208 4065; CHECK-SD-NEXT: add x11, sp, #824 
4066; CHECK-SD-NEXT: ld1 { v6.b }[13], [x8] 4067; CHECK-SD-NEXT: add x10, sp, #1328 4068; CHECK-SD-NEXT: ld1 { v17.b }[13], [x11] 4069; CHECK-SD-NEXT: add x11, sp, #944 4070; CHECK-SD-NEXT: add x9, sp, #184 4071; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] 4072; CHECK-SD-NEXT: ld1 { v16.b }[12], [x11] 4073; CHECK-SD-NEXT: add x8, sp, #1216 4074; CHECK-SD-NEXT: add x12, sp, #832 4075; CHECK-SD-NEXT: ld1 { v3.b }[13], [x9] 4076; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] 4077; CHECK-SD-NEXT: ld1 { v17.b }[14], [x12] 4078; CHECK-SD-NEXT: add x9, sp, #1336 4079; CHECK-SD-NEXT: add x10, sp, #952 4080; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] 4081; CHECK-SD-NEXT: ld1 { v16.b }[13], [x10] 4082; CHECK-SD-NEXT: add x8, sp, #1224 4083; CHECK-SD-NEXT: add x11, sp, #840 4084; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] 4085; CHECK-SD-NEXT: add x8, sp, #192 4086; CHECK-SD-NEXT: ld1 { v17.b }[15], [x11] 4087; CHECK-SD-NEXT: add x10, sp, #1344 4088; CHECK-SD-NEXT: add x11, sp, #960 4089; CHECK-SD-NEXT: ld1 { v3.b }[14], [x8] 4090; CHECK-SD-NEXT: ld1 { v7.b }[14], [x10] 4091; CHECK-SD-NEXT: ld1 { v16.b }[14], [x11] 4092; CHECK-SD-NEXT: add x9, sp, #584 4093; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v0.16b 4094; CHECK-SD-NEXT: add x8, sp, #200 4095; CHECK-SD-NEXT: sdot v4.4s, v17.16b, v6.16b 4096; CHECK-SD-NEXT: ld1 { v2.b }[15], [x9] 4097; CHECK-SD-NEXT: add x9, sp, #1352 4098; CHECK-SD-NEXT: add x10, sp, #968 4099; CHECK-SD-NEXT: ld1 { v3.b }[15], [x8] 4100; CHECK-SD-NEXT: ld1 { v7.b }[15], [x9] 4101; CHECK-SD-NEXT: ld1 { v16.b }[15], [x10] 4102; CHECK-SD-NEXT: sdot v5.4s, v3.16b, v2.16b 4103; CHECK-SD-NEXT: sdot v4.4s, v16.16b, v7.16b 4104; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s 4105; CHECK-SD-NEXT: addv s0, v0.4s 4106; CHECK-SD-NEXT: fmov w0, s0 4107; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 4108; CHECK-SD-NEXT: ret 4109; 4110; CHECK-GI-LABEL: test_sdot_v48i8_double: 4111; CHECK-GI: // %bb.0: // %entry 4112; CHECK-GI-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 4113; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 4114; CHECK-GI-NEXT: .cfi_offset w29, -16 4115; CHECK-GI-NEXT: ldr w11, [sp, #80] 4116; CHECK-GI-NEXT: ldr w10, [sp, #208] 4117; CHECK-GI-NEXT: fmov s0, w0 4118; CHECK-GI-NEXT: ldr w8, [sp, #88] 4119; CHECK-GI-NEXT: ldr w12, [sp, #344] 4120; CHECK-GI-NEXT: movi v20.2d, #0000000000000000 4121; CHECK-GI-NEXT: fmov s1, w11 4122; CHECK-GI-NEXT: ldr w11, [sp, #336] 4123; CHECK-GI-NEXT: fmov s2, w10 4124; CHECK-GI-NEXT: ldr w10, [sp, #464] 4125; CHECK-GI-NEXT: ldr w9, [sp, #216] 4126; CHECK-GI-NEXT: mov v0.b[1], w1 4127; CHECK-GI-NEXT: fmov s3, w11 4128; CHECK-GI-NEXT: ldr w11, [sp, #600] 4129; CHECK-GI-NEXT: movi v21.2d, #0000000000000000 4130; CHECK-GI-NEXT: mov v1.b[1], w8 4131; CHECK-GI-NEXT: ldr w8, [sp, #592] 4132; CHECK-GI-NEXT: fmov s4, w10 4133; CHECK-GI-NEXT: mov v2.b[1], w9 4134; CHECK-GI-NEXT: ldr w9, [sp, #472] 4135; CHECK-GI-NEXT: ldr w10, [sp, #608] 4136; CHECK-GI-NEXT: mov v3.b[1], w12 4137; CHECK-GI-NEXT: fmov s5, w8 4138; CHECK-GI-NEXT: ldr w8, [sp, #96] 4139; CHECK-GI-NEXT: mov v4.b[1], w9 4140; CHECK-GI-NEXT: ldr w9, [sp, #224] 4141; CHECK-GI-NEXT: mov v0.b[2], w2 4142; CHECK-GI-NEXT: mov v1.b[2], w8 4143; CHECK-GI-NEXT: ldr w8, [sp, #352] 4144; CHECK-GI-NEXT: ldr w12, [sp, #848] 4145; CHECK-GI-NEXT: mov v2.b[2], w9 4146; CHECK-GI-NEXT: ldr w9, [sp, #480] 4147; CHECK-GI-NEXT: mov v5.b[1], w11 4148; CHECK-GI-NEXT: mov v3.b[2], w8 4149; CHECK-GI-NEXT: ldr w8, [sp, #104] 4150; CHECK-GI-NEXT: ldr w11, [sp, #16] 4151; CHECK-GI-NEXT: mov v4.b[2], w9 4152; CHECK-GI-NEXT: ldr w9, [sp, #232] 4153; CHECK-GI-NEXT: mov v0.b[3], w3 4154; CHECK-GI-NEXT: mov v1.b[3], w8 4155; CHECK-GI-NEXT: ldr w8, [sp, #360] 4156; CHECK-GI-NEXT: fmov s7, w12 4157; CHECK-GI-NEXT: mov v2.b[3], w9 4158; CHECK-GI-NEXT: ldr w9, [sp, #488] 4159; CHECK-GI-NEXT: mov v5.b[2], w10 4160; CHECK-GI-NEXT: mov v3.b[3], w8 4161; CHECK-GI-NEXT: ldr w8, [sp, #112] 4162; CHECK-GI-NEXT: ldr w10, [sp, #616] 4163; CHECK-GI-NEXT: mov 
v4.b[3], w9 4164; CHECK-GI-NEXT: ldr w9, [sp, #240] 4165; CHECK-GI-NEXT: mov v0.b[4], w4 4166; CHECK-GI-NEXT: mov v1.b[4], w8 4167; CHECK-GI-NEXT: ldr w8, [sp, #368] 4168; CHECK-GI-NEXT: ldr w12, [sp, #1112] 4169; CHECK-GI-NEXT: mov v2.b[4], w9 4170; CHECK-GI-NEXT: ldr w9, [sp, #496] 4171; CHECK-GI-NEXT: mov v5.b[3], w10 4172; CHECK-GI-NEXT: mov v3.b[4], w8 4173; CHECK-GI-NEXT: ldr w8, [sp, #120] 4174; CHECK-GI-NEXT: ldr w10, [sp, #624] 4175; CHECK-GI-NEXT: mov v4.b[4], w9 4176; CHECK-GI-NEXT: ldr w9, [sp, #248] 4177; CHECK-GI-NEXT: mov v0.b[5], w5 4178; CHECK-GI-NEXT: mov v1.b[5], w8 4179; CHECK-GI-NEXT: ldr w8, [sp, #376] 4180; CHECK-GI-NEXT: movi v22.2d, #0000000000000000 4181; CHECK-GI-NEXT: mov v2.b[5], w9 4182; CHECK-GI-NEXT: ldr w9, [sp, #504] 4183; CHECK-GI-NEXT: mov v5.b[4], w10 4184; CHECK-GI-NEXT: mov v3.b[5], w8 4185; CHECK-GI-NEXT: ldr w8, [sp, #128] 4186; CHECK-GI-NEXT: ldr w10, [sp, #632] 4187; CHECK-GI-NEXT: mov v4.b[5], w9 4188; CHECK-GI-NEXT: ldr w9, [sp, #256] 4189; CHECK-GI-NEXT: mov v0.b[6], w6 4190; CHECK-GI-NEXT: mov v1.b[6], w8 4191; CHECK-GI-NEXT: ldr w8, [sp, #384] 4192; CHECK-GI-NEXT: movi v23.2d, #0000000000000000 4193; CHECK-GI-NEXT: mov v2.b[6], w9 4194; CHECK-GI-NEXT: ldr w9, [sp, #512] 4195; CHECK-GI-NEXT: mov v5.b[5], w10 4196; CHECK-GI-NEXT: mov v3.b[6], w8 4197; CHECK-GI-NEXT: ldr w8, [sp, #136] 4198; CHECK-GI-NEXT: ldr w10, [sp, #640] 4199; CHECK-GI-NEXT: mov v4.b[6], w9 4200; CHECK-GI-NEXT: ldr w9, [sp, #264] 4201; CHECK-GI-NEXT: mov v0.b[7], w7 4202; CHECK-GI-NEXT: mov v1.b[7], w8 4203; CHECK-GI-NEXT: ldr w8, [sp, #392] 4204; CHECK-GI-NEXT: movi v24.2d, #0000000000000000 4205; CHECK-GI-NEXT: mov v2.b[7], w9 4206; CHECK-GI-NEXT: ldr w9, [sp, #520] 4207; CHECK-GI-NEXT: mov v5.b[6], w10 4208; CHECK-GI-NEXT: mov v3.b[7], w8 4209; CHECK-GI-NEXT: ldr w8, [sp, #144] 4210; CHECK-GI-NEXT: ldr w10, [sp, #648] 4211; CHECK-GI-NEXT: mov v4.b[7], w9 4212; CHECK-GI-NEXT: ldr w9, [sp, #272] 4213; CHECK-GI-NEXT: mov v0.b[8], w11 4214; 
CHECK-GI-NEXT: mov v1.b[8], w8 4215; CHECK-GI-NEXT: ldr w8, [sp, #400] 4216; CHECK-GI-NEXT: ldr w11, [sp, #24] 4217; CHECK-GI-NEXT: mov v2.b[8], w9 4218; CHECK-GI-NEXT: ldr w9, [sp, #528] 4219; CHECK-GI-NEXT: mov v5.b[7], w10 4220; CHECK-GI-NEXT: mov v3.b[8], w8 4221; CHECK-GI-NEXT: ldr w8, [sp, #152] 4222; CHECK-GI-NEXT: ldr w10, [sp, #656] 4223; CHECK-GI-NEXT: mov v4.b[8], w9 4224; CHECK-GI-NEXT: ldr w9, [sp, #280] 4225; CHECK-GI-NEXT: mov v0.b[9], w11 4226; CHECK-GI-NEXT: mov v1.b[9], w8 4227; CHECK-GI-NEXT: ldr w8, [sp, #408] 4228; CHECK-GI-NEXT: ldr w11, [sp, #32] 4229; CHECK-GI-NEXT: mov v2.b[9], w9 4230; CHECK-GI-NEXT: ldr w9, [sp, #536] 4231; CHECK-GI-NEXT: mov v5.b[8], w10 4232; CHECK-GI-NEXT: mov v3.b[9], w8 4233; CHECK-GI-NEXT: ldr w8, [sp, #160] 4234; CHECK-GI-NEXT: ldr w10, [sp, #664] 4235; CHECK-GI-NEXT: mov v4.b[9], w9 4236; CHECK-GI-NEXT: ldr w9, [sp, #288] 4237; CHECK-GI-NEXT: mov v0.b[10], w11 4238; CHECK-GI-NEXT: mov v1.b[10], w8 4239; CHECK-GI-NEXT: ldr w8, [sp, #416] 4240; CHECK-GI-NEXT: ldr w11, [sp, #40] 4241; CHECK-GI-NEXT: mov v2.b[10], w9 4242; CHECK-GI-NEXT: ldr w9, [sp, #544] 4243; CHECK-GI-NEXT: mov v5.b[9], w10 4244; CHECK-GI-NEXT: mov v3.b[10], w8 4245; CHECK-GI-NEXT: ldr w8, [sp, #168] 4246; CHECK-GI-NEXT: ldr w10, [sp, #672] 4247; CHECK-GI-NEXT: mov v4.b[10], w9 4248; CHECK-GI-NEXT: ldr w9, [sp, #296] 4249; CHECK-GI-NEXT: mov v0.b[11], w11 4250; CHECK-GI-NEXT: mov v1.b[11], w8 4251; CHECK-GI-NEXT: ldr w8, [sp, #424] 4252; CHECK-GI-NEXT: ldr w11, [sp, #48] 4253; CHECK-GI-NEXT: mov v2.b[11], w9 4254; CHECK-GI-NEXT: ldr w9, [sp, #552] 4255; CHECK-GI-NEXT: mov v5.b[10], w10 4256; CHECK-GI-NEXT: mov v3.b[11], w8 4257; CHECK-GI-NEXT: ldr w8, [sp, #176] 4258; CHECK-GI-NEXT: ldr w10, [sp, #680] 4259; CHECK-GI-NEXT: mov v4.b[11], w9 4260; CHECK-GI-NEXT: ldr w9, [sp, #304] 4261; CHECK-GI-NEXT: mov v0.b[12], w11 4262; CHECK-GI-NEXT: mov v1.b[12], w8 4263; CHECK-GI-NEXT: ldr w8, [sp, #432] 4264; CHECK-GI-NEXT: ldr w11, [sp, #56] 4265; 
CHECK-GI-NEXT: mov v2.b[12], w9 4266; CHECK-GI-NEXT: ldr w9, [sp, #560] 4267; CHECK-GI-NEXT: mov v5.b[11], w10 4268; CHECK-GI-NEXT: mov v3.b[12], w8 4269; CHECK-GI-NEXT: ldr w8, [sp, #184] 4270; CHECK-GI-NEXT: ldr w10, [sp, #688] 4271; CHECK-GI-NEXT: mov v4.b[12], w9 4272; CHECK-GI-NEXT: ldr w9, [sp, #312] 4273; CHECK-GI-NEXT: mov v0.b[13], w11 4274; CHECK-GI-NEXT: mov v1.b[13], w8 4275; CHECK-GI-NEXT: ldr w8, [sp, #440] 4276; CHECK-GI-NEXT: ldr w11, [sp, #64] 4277; CHECK-GI-NEXT: mov v2.b[13], w9 4278; CHECK-GI-NEXT: ldr w9, [sp, #568] 4279; CHECK-GI-NEXT: mov v5.b[12], w10 4280; CHECK-GI-NEXT: mov v3.b[13], w8 4281; CHECK-GI-NEXT: ldr w8, [sp, #192] 4282; CHECK-GI-NEXT: ldr w10, [sp, #696] 4283; CHECK-GI-NEXT: mov v4.b[13], w9 4284; CHECK-GI-NEXT: ldr w9, [sp, #320] 4285; CHECK-GI-NEXT: mov v0.b[14], w11 4286; CHECK-GI-NEXT: mov v1.b[14], w8 4287; CHECK-GI-NEXT: ldr w8, [sp, #448] 4288; CHECK-GI-NEXT: ldr w11, [sp, #72] 4289; CHECK-GI-NEXT: mov v2.b[14], w9 4290; CHECK-GI-NEXT: ldr w9, [sp, #576] 4291; CHECK-GI-NEXT: mov v5.b[13], w10 4292; CHECK-GI-NEXT: mov v3.b[14], w8 4293; CHECK-GI-NEXT: ldr w8, [sp, #720] 4294; CHECK-GI-NEXT: ldr w10, [sp, #704] 4295; CHECK-GI-NEXT: mov v4.b[14], w9 4296; CHECK-GI-NEXT: ldr w9, [sp, #728] 4297; CHECK-GI-NEXT: mov v0.b[15], w11 4298; CHECK-GI-NEXT: fmov s6, w8 4299; CHECK-GI-NEXT: ldr w8, [sp, #328] 4300; CHECK-GI-NEXT: ldr w11, [sp, #456] 4301; CHECK-GI-NEXT: mov v5.b[14], w10 4302; CHECK-GI-NEXT: ldr w10, [sp, #200] 4303; CHECK-GI-NEXT: movi v25.2d, #0000000000000000 4304; CHECK-GI-NEXT: mov v2.b[15], w8 4305; CHECK-GI-NEXT: mov v3.b[15], w11 4306; CHECK-GI-NEXT: ldr w11, [sp, #736] 4307; CHECK-GI-NEXT: mov v6.b[1], w9 4308; CHECK-GI-NEXT: ldr w9, [sp, #584] 4309; CHECK-GI-NEXT: ldr w8, [sp, #856] 4310; CHECK-GI-NEXT: mov v1.b[15], w10 4311; CHECK-GI-NEXT: ldr w10, [sp, #712] 4312; CHECK-GI-NEXT: mov v4.b[15], w9 4313; CHECK-GI-NEXT: ldr w9, [sp, #976] 4314; CHECK-GI-NEXT: mov v7.b[1], w8 4315; CHECK-GI-NEXT: ldr w8, [sp, 
#1232] 4316; CHECK-GI-NEXT: mov v5.b[15], w10 4317; CHECK-GI-NEXT: ldr w10, [sp, #984] 4318; CHECK-GI-NEXT: mov v6.b[2], w11 4319; CHECK-GI-NEXT: ldr w11, [sp, #1104] 4320; CHECK-GI-NEXT: fmov s16, w9 4321; CHECK-GI-NEXT: ldr w9, [sp, #1360] 4322; CHECK-GI-NEXT: fmov s18, w8 4323; CHECK-GI-NEXT: ldr w8, [sp, #1368] 4324; CHECK-GI-NEXT: fmov s17, w11 4325; CHECK-GI-NEXT: ldr w11, [sp, #1240] 4326; CHECK-GI-NEXT: sdot v20.4s, v0.16b, v3.16b 4327; CHECK-GI-NEXT: mov v16.b[1], w10 4328; CHECK-GI-NEXT: fmov s19, w9 4329; CHECK-GI-NEXT: ldr w10, [sp, #864] 4330; CHECK-GI-NEXT: mov v18.b[1], w11 4331; CHECK-GI-NEXT: ldr w11, [sp, #992] 4332; CHECK-GI-NEXT: ldr w9, [sp, #1120] 4333; CHECK-GI-NEXT: mov v17.b[1], w12 4334; CHECK-GI-NEXT: mov v7.b[2], w10 4335; CHECK-GI-NEXT: ldr w10, [sp, #1248] 4336; CHECK-GI-NEXT: mov v19.b[1], w8 4337; CHECK-GI-NEXT: ldr w8, [sp, #744] 4338; CHECK-GI-NEXT: sdot v21.4s, v1.16b, v4.16b 4339; CHECK-GI-NEXT: mov v16.b[2], w11 4340; CHECK-GI-NEXT: ldr w11, [sp, #872] 4341; CHECK-GI-NEXT: addv s0, v20.4s 4342; CHECK-GI-NEXT: mov v6.b[3], w8 4343; CHECK-GI-NEXT: ldr w8, [sp, #1000] 4344; CHECK-GI-NEXT: mov v18.b[2], w10 4345; CHECK-GI-NEXT: mov v17.b[2], w9 4346; CHECK-GI-NEXT: ldr w9, [sp, #1376] 4347; CHECK-GI-NEXT: ldr w10, [sp, #1128] 4348; CHECK-GI-NEXT: mov v7.b[3], w11 4349; CHECK-GI-NEXT: ldr w11, [sp, #880] 4350; CHECK-GI-NEXT: addv s1, v21.4s 4351; CHECK-GI-NEXT: mov v19.b[2], w9 4352; CHECK-GI-NEXT: ldr w9, [sp, #752] 4353; CHECK-GI-NEXT: mov v16.b[3], w8 4354; CHECK-GI-NEXT: ldr w8, [sp, #1256] 4355; CHECK-GI-NEXT: sdot v25.4s, v2.16b, v5.16b 4356; CHECK-GI-NEXT: mov v17.b[3], w10 4357; CHECK-GI-NEXT: ldr w10, [sp, #1384] 4358; CHECK-GI-NEXT: mov v6.b[4], w9 4359; CHECK-GI-NEXT: ldr w9, [sp, #1008] 4360; CHECK-GI-NEXT: mov v18.b[3], w8 4361; CHECK-GI-NEXT: ldr w8, [sp, #1136] 4362; CHECK-GI-NEXT: mov v19.b[3], w10 4363; CHECK-GI-NEXT: ldr w10, [sp, #760] 4364; CHECK-GI-NEXT: mov v7.b[4], w11 4365; CHECK-GI-NEXT: mov v16.b[4], w9 
4366; CHECK-GI-NEXT: ldr w9, [sp, #1264] 4367; CHECK-GI-NEXT: ldr w11, [sp, #888] 4368; CHECK-GI-NEXT: mov v17.b[4], w8 4369; CHECK-GI-NEXT: ldr w8, [sp, #1392] 4370; CHECK-GI-NEXT: mov v6.b[5], w10 4371; CHECK-GI-NEXT: ldr w10, [sp, #1016] 4372; CHECK-GI-NEXT: mov v18.b[4], w9 4373; CHECK-GI-NEXT: ldr w9, [sp, #1144] 4374; CHECK-GI-NEXT: mov v19.b[4], w8 4375; CHECK-GI-NEXT: ldr w8, [sp, #768] 4376; CHECK-GI-NEXT: mov v7.b[5], w11 4377; CHECK-GI-NEXT: mov v16.b[5], w10 4378; CHECK-GI-NEXT: ldr w10, [sp, #1272] 4379; CHECK-GI-NEXT: ldr w11, [sp, #896] 4380; CHECK-GI-NEXT: mov v17.b[5], w9 4381; CHECK-GI-NEXT: ldr w9, [sp, #1400] 4382; CHECK-GI-NEXT: mov v6.b[6], w8 4383; CHECK-GI-NEXT: ldr w8, [sp, #1024] 4384; CHECK-GI-NEXT: mov v18.b[5], w10 4385; CHECK-GI-NEXT: ldr w10, [sp, #1152] 4386; CHECK-GI-NEXT: mov v19.b[5], w9 4387; CHECK-GI-NEXT: ldr w9, [sp, #776] 4388; CHECK-GI-NEXT: mov v7.b[6], w11 4389; CHECK-GI-NEXT: mov v16.b[6], w8 4390; CHECK-GI-NEXT: ldr w8, [sp, #1280] 4391; CHECK-GI-NEXT: ldr w11, [sp, #904] 4392; CHECK-GI-NEXT: mov v17.b[6], w10 4393; CHECK-GI-NEXT: ldr w10, [sp, #1408] 4394; CHECK-GI-NEXT: mov v6.b[7], w9 4395; CHECK-GI-NEXT: ldr w9, [sp, #1032] 4396; CHECK-GI-NEXT: mov v18.b[6], w8 4397; CHECK-GI-NEXT: ldr w8, [sp, #1160] 4398; CHECK-GI-NEXT: mov v19.b[6], w10 4399; CHECK-GI-NEXT: ldr w10, [sp, #784] 4400; CHECK-GI-NEXT: mov v7.b[7], w11 4401; CHECK-GI-NEXT: mov v16.b[7], w9 4402; CHECK-GI-NEXT: ldr w9, [sp, #1288] 4403; CHECK-GI-NEXT: ldr w11, [sp, #912] 4404; CHECK-GI-NEXT: mov v17.b[7], w8 4405; CHECK-GI-NEXT: ldr w8, [sp, #1416] 4406; CHECK-GI-NEXT: mov v6.b[8], w10 4407; CHECK-GI-NEXT: ldr w10, [sp, #1040] 4408; CHECK-GI-NEXT: mov v18.b[7], w9 4409; CHECK-GI-NEXT: ldr w9, [sp, #1168] 4410; CHECK-GI-NEXT: mov v19.b[7], w8 4411; CHECK-GI-NEXT: ldr w8, [sp, #792] 4412; CHECK-GI-NEXT: mov v7.b[8], w11 4413; CHECK-GI-NEXT: mov v16.b[8], w10 4414; CHECK-GI-NEXT: ldr w10, [sp, #1296] 4415; CHECK-GI-NEXT: ldr w11, [sp, #920] 4416; 
CHECK-GI-NEXT: mov v17.b[8], w9 4417; CHECK-GI-NEXT: ldr w9, [sp, #1424] 4418; CHECK-GI-NEXT: mov v6.b[9], w8 4419; CHECK-GI-NEXT: ldr w8, [sp, #1048] 4420; CHECK-GI-NEXT: mov v18.b[8], w10 4421; CHECK-GI-NEXT: ldr w10, [sp, #1176] 4422; CHECK-GI-NEXT: mov v19.b[8], w9 4423; CHECK-GI-NEXT: ldr w9, [sp, #800] 4424; CHECK-GI-NEXT: mov v7.b[9], w11 4425; CHECK-GI-NEXT: mov v16.b[9], w8 4426; CHECK-GI-NEXT: ldr w8, [sp, #1304] 4427; CHECK-GI-NEXT: ldr w11, [sp, #928] 4428; CHECK-GI-NEXT: mov v17.b[9], w10 4429; CHECK-GI-NEXT: ldr w10, [sp, #1432] 4430; CHECK-GI-NEXT: mov v6.b[10], w9 4431; CHECK-GI-NEXT: ldr w9, [sp, #1056] 4432; CHECK-GI-NEXT: mov v18.b[9], w8 4433; CHECK-GI-NEXT: ldr w8, [sp, #1184] 4434; CHECK-GI-NEXT: mov v19.b[9], w10 4435; CHECK-GI-NEXT: ldr w10, [sp, #808] 4436; CHECK-GI-NEXT: mov v7.b[10], w11 4437; CHECK-GI-NEXT: mov v16.b[10], w9 4438; CHECK-GI-NEXT: ldr w9, [sp, #1312] 4439; CHECK-GI-NEXT: ldr w11, [sp, #936] 4440; CHECK-GI-NEXT: mov v17.b[10], w8 4441; CHECK-GI-NEXT: ldr w8, [sp, #1440] 4442; CHECK-GI-NEXT: mov v6.b[11], w10 4443; CHECK-GI-NEXT: ldr w10, [sp, #1064] 4444; CHECK-GI-NEXT: mov v18.b[10], w9 4445; CHECK-GI-NEXT: ldr w9, [sp, #1192] 4446; CHECK-GI-NEXT: mov v19.b[10], w8 4447; CHECK-GI-NEXT: ldr w8, [sp, #816] 4448; CHECK-GI-NEXT: mov v7.b[11], w11 4449; CHECK-GI-NEXT: mov v16.b[11], w10 4450; CHECK-GI-NEXT: ldr w10, [sp, #1320] 4451; CHECK-GI-NEXT: ldr w11, [sp, #944] 4452; CHECK-GI-NEXT: mov v17.b[11], w9 4453; CHECK-GI-NEXT: ldr w9, [sp, #1448] 4454; CHECK-GI-NEXT: mov v6.b[12], w8 4455; CHECK-GI-NEXT: ldr w8, [sp, #1072] 4456; CHECK-GI-NEXT: mov v18.b[11], w10 4457; CHECK-GI-NEXT: ldr w10, [sp, #1200] 4458; CHECK-GI-NEXT: mov v19.b[11], w9 4459; CHECK-GI-NEXT: ldr w9, [sp, #824] 4460; CHECK-GI-NEXT: mov v7.b[12], w11 4461; CHECK-GI-NEXT: mov v16.b[12], w8 4462; CHECK-GI-NEXT: ldr w8, [sp, #1328] 4463; CHECK-GI-NEXT: ldr w11, [sp, #952] 4464; CHECK-GI-NEXT: mov v17.b[12], w10 4465; CHECK-GI-NEXT: ldr w10, [sp, #1456] 4466; 
CHECK-GI-NEXT: mov v6.b[13], w9 4467; CHECK-GI-NEXT: ldr w9, [sp, #1080] 4468; CHECK-GI-NEXT: mov v18.b[12], w8 4469; CHECK-GI-NEXT: ldr w8, [sp, #1208] 4470; CHECK-GI-NEXT: mov v19.b[12], w10 4471; CHECK-GI-NEXT: ldr w10, [sp, #832] 4472; CHECK-GI-NEXT: mov v7.b[13], w11 4473; CHECK-GI-NEXT: mov v16.b[13], w9 4474; CHECK-GI-NEXT: ldr w9, [sp, #1336] 4475; CHECK-GI-NEXT: ldr w11, [sp, #960] 4476; CHECK-GI-NEXT: mov v17.b[13], w8 4477; CHECK-GI-NEXT: ldr w8, [sp, #1464] 4478; CHECK-GI-NEXT: mov v6.b[14], w10 4479; CHECK-GI-NEXT: ldr w10, [sp, #1088] 4480; CHECK-GI-NEXT: mov v18.b[13], w9 4481; CHECK-GI-NEXT: ldr w9, [sp, #1216] 4482; CHECK-GI-NEXT: mov v19.b[13], w8 4483; CHECK-GI-NEXT: ldr w8, [sp, #840] 4484; CHECK-GI-NEXT: mov v7.b[14], w11 4485; CHECK-GI-NEXT: mov v16.b[14], w10 4486; CHECK-GI-NEXT: ldr w10, [sp, #1344] 4487; CHECK-GI-NEXT: ldr w11, [sp, #968] 4488; CHECK-GI-NEXT: mov v17.b[14], w9 4489; CHECK-GI-NEXT: mov v6.b[15], w8 4490; CHECK-GI-NEXT: ldr w8, [sp, #1096] 4491; CHECK-GI-NEXT: mov v18.b[14], w10 4492; CHECK-GI-NEXT: ldr w9, [sp, #1472] 4493; CHECK-GI-NEXT: ldr w10, [sp, #1224] 4494; CHECK-GI-NEXT: mov v7.b[15], w11 4495; CHECK-GI-NEXT: addv s4, v25.4s 4496; CHECK-GI-NEXT: mov v16.b[15], w8 4497; CHECK-GI-NEXT: ldr w8, [sp, #1352] 4498; CHECK-GI-NEXT: mov v19.b[14], w9 4499; CHECK-GI-NEXT: mov v17.b[15], w10 4500; CHECK-GI-NEXT: ldr w9, [sp, #1480] 4501; CHECK-GI-NEXT: mov v18.b[15], w8 4502; CHECK-GI-NEXT: fmov w8, s0 4503; CHECK-GI-NEXT: fmov w11, s4 4504; CHECK-GI-NEXT: mov v19.b[15], w9 4505; CHECK-GI-NEXT: fmov w9, s1 4506; CHECK-GI-NEXT: sdot v22.4s, v6.16b, v17.16b 4507; CHECK-GI-NEXT: sdot v23.4s, v7.16b, v18.16b 4508; CHECK-GI-NEXT: add w8, w8, w9 4509; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v19.16b 4510; CHECK-GI-NEXT: add w8, w8, w11 4511; CHECK-GI-NEXT: addv s2, v22.4s 4512; CHECK-GI-NEXT: addv s3, v23.4s 4513; CHECK-GI-NEXT: addv s5, v24.4s 4514; CHECK-GI-NEXT: fmov w9, s2 4515; CHECK-GI-NEXT: fmov w10, s3 4516; CHECK-GI-NEXT: add 
w9, w9, w10 4517; CHECK-GI-NEXT: fmov w10, s5 4518; CHECK-GI-NEXT: add w9, w9, w10 4519; CHECK-GI-NEXT: add w0, w8, w9 4520; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 4521; CHECK-GI-NEXT: ret 4522entry: 4523 %az = sext <48 x i8> %a to <48 x i32> 4524 %bz = sext <48 x i8> %b to <48 x i32> 4525 %m1 = mul nuw nsw <48 x i32> %az, %bz 4526 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m1) 4527 %cz = sext <48 x i8> %c to <48 x i32> 4528 %dz = sext <48 x i8> %d to <48 x i32> 4529 %m2 = mul nuw nsw <48 x i32> %cz, %dz 4530 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m2) 4531 %x = add i32 %r1, %r2 4532 ret i32 %x 4533} 4534 4535define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) { 4536; CHECK-SD-LABEL: test_sdot_v48i8_double_nomla: 4537; CHECK-SD: // %bb.0: // %entry 4538; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 4539; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 4540; CHECK-SD-NEXT: .cfi_offset w29, -16 4541; CHECK-SD-NEXT: ldr b5, [sp, #208] 4542; CHECK-SD-NEXT: add x8, sp, #216 4543; CHECK-SD-NEXT: fmov s0, w0 4544; CHECK-SD-NEXT: ldr b4, [sp, #976] 4545; CHECK-SD-NEXT: add x9, sp, #984 4546; CHECK-SD-NEXT: add x12, sp, #328 4547; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] 4548; CHECK-SD-NEXT: add x8, sp, #224 4549; CHECK-SD-NEXT: movi v1.16b, #1 4550; CHECK-SD-NEXT: mov v0.b[1], w1 4551; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] 4552; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 4553; CHECK-SD-NEXT: add x11, sp, #992 4554; CHECK-SD-NEXT: ldr b6, [sp, #720] 4555; CHECK-SD-NEXT: ldr b7, [sp, #80] 4556; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] 4557; CHECK-SD-NEXT: add x8, sp, #232 4558; CHECK-SD-NEXT: add x13, sp, #88 4559; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] 4560; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] 4561; CHECK-SD-NEXT: add x13, sp, #856 4562; CHECK-SD-NEXT: mov v0.b[2], w2 4563; CHECK-SD-NEXT: add x14, sp, #1008 4564; CHECK-SD-NEXT: add x15, sp, #872 4565; CHECK-SD-NEXT: ld1 { v5.b 
}[3], [x8] 4566; CHECK-SD-NEXT: add x8, sp, #240 4567; CHECK-SD-NEXT: add x16, sp, #888 4568; CHECK-SD-NEXT: add x10, sp, #16 4569; CHECK-SD-NEXT: add x9, sp, #24 4570; CHECK-SD-NEXT: add x11, sp, #40 4571; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 4572; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] 4573; CHECK-SD-NEXT: add x8, sp, #248 4574; CHECK-SD-NEXT: mov v0.b[3], w3 4575; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] 4576; CHECK-SD-NEXT: add x8, sp, #256 4577; CHECK-SD-NEXT: mov v0.b[4], w4 4578; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] 4579; CHECK-SD-NEXT: add x8, sp, #264 4580; CHECK-SD-NEXT: mov v0.b[5], w5 4581; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] 4582; CHECK-SD-NEXT: add x8, sp, #272 4583; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] 4584; CHECK-SD-NEXT: add x8, sp, #280 4585; CHECK-SD-NEXT: mov v0.b[6], w6 4586; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] 4587; CHECK-SD-NEXT: add x8, sp, #288 4588; CHECK-SD-NEXT: mov v0.b[7], w7 4589; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] 4590; CHECK-SD-NEXT: add x8, sp, #296 4591; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] 4592; CHECK-SD-NEXT: add x10, sp, #128 4593; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] 4594; CHECK-SD-NEXT: add x8, sp, #304 4595; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] 4596; CHECK-SD-NEXT: add x9, sp, #136 4597; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] 4598; CHECK-SD-NEXT: add x8, sp, #312 4599; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] 4600; CHECK-SD-NEXT: add x8, sp, #320 4601; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] 4602; CHECK-SD-NEXT: add x8, sp, #32 4603; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] 4604; CHECK-SD-NEXT: add x8, sp, #144 4605; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] 4606; CHECK-SD-NEXT: add x12, sp, #728 4607; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] 4608; CHECK-SD-NEXT: add x12, sp, #1000 4609; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] 4610; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] 4611; CHECK-SD-NEXT: add x12, sp, #736 4612; CHECK-SD-NEXT: add x11, sp, #920 4613; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b 4614; CHECK-SD-NEXT: ldr b5, [sp, 
#848] 4615; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] 4616; CHECK-SD-NEXT: add x12, sp, #48 4617; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] 4618; CHECK-SD-NEXT: add x13, sp, #744 4619; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] 4620; CHECK-SD-NEXT: add x14, sp, #96 4621; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] 4622; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] 4623; CHECK-SD-NEXT: add x13, sp, #864 4624; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] 4625; CHECK-SD-NEXT: add x14, sp, #1016 4626; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] 4627; CHECK-SD-NEXT: add x13, sp, #752 4628; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] 4629; CHECK-SD-NEXT: add x14, sp, #104 4630; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] 4631; CHECK-SD-NEXT: add x13, sp, #1024 4632; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] 4633; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] 4634; CHECK-SD-NEXT: add x15, sp, #760 4635; CHECK-SD-NEXT: add x14, sp, #112 4636; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] 4637; CHECK-SD-NEXT: add x13, sp, #880 4638; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] 4639; CHECK-SD-NEXT: add x15, sp, #1032 4640; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] 4641; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] 4642; CHECK-SD-NEXT: add x14, sp, #768 4643; CHECK-SD-NEXT: add x13, sp, #120 4644; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] 4645; CHECK-SD-NEXT: add x15, sp, #1040 4646; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] 4647; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] 4648; CHECK-SD-NEXT: add x13, sp, #776 4649; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] 4650; CHECK-SD-NEXT: add x14, sp, #1048 4651; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] 4652; CHECK-SD-NEXT: add x15, sp, #896 4653; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] 4654; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] 4655; CHECK-SD-NEXT: add x10, sp, #784 4656; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] 4657; CHECK-SD-NEXT: add x13, sp, #1056 4658; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] 4659; CHECK-SD-NEXT: add x14, sp, #904 4660; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] 4661; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] 4662; CHECK-SD-NEXT: add 
x9, sp, #792 4663; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] 4664; CHECK-SD-NEXT: add x10, sp, #1064 4665; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] 4666; CHECK-SD-NEXT: add x13, sp, #912 4667; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9] 4668; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] 4669; CHECK-SD-NEXT: add x9, sp, #800 4670; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] 4671; CHECK-SD-NEXT: add x8, sp, #152 4672; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] 4673; CHECK-SD-NEXT: add x10, sp, #1072 4674; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] 4675; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] 4676; CHECK-SD-NEXT: add x9, sp, #808 4677; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] 4678; CHECK-SD-NEXT: add x8, sp, #56 4679; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] 4680; CHECK-SD-NEXT: add x10, sp, #160 4681; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] 4682; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] 4683; CHECK-SD-NEXT: add x9, sp, #928 4684; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] 4685; CHECK-SD-NEXT: add x10, sp, #1080 4686; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] 4687; CHECK-SD-NEXT: add x8, sp, #816 4688; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] 4689; CHECK-SD-NEXT: add x9, sp, #168 4690; CHECK-SD-NEXT: add x10, sp, #176 4691; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] 4692; CHECK-SD-NEXT: add x8, sp, #936 4693; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] 4694; CHECK-SD-NEXT: add x9, sp, #1088 4695; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] 4696; CHECK-SD-NEXT: add x8, sp, #64 4697; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] 4698; CHECK-SD-NEXT: add x9, sp, #824 4699; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] 4700; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] 4701; CHECK-SD-NEXT: add x9, sp, #944 4702; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] 4703; CHECK-SD-NEXT: add x10, sp, #1096 4704; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] 4705; CHECK-SD-NEXT: add x8, sp, #832 4706; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] 4707; CHECK-SD-NEXT: add x9, sp, #184 4708; CHECK-SD-NEXT: add x10, sp, #72 4709; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] 4710; CHECK-SD-NEXT: add x8, sp, 
#952 4711; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] 4712; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] 4713; CHECK-SD-NEXT: add x8, sp, #840 4714; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] 4715; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b 4716; CHECK-SD-NEXT: add x9, sp, #192 4717; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] 4718; CHECK-SD-NEXT: add x8, sp, #960 4719; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] 4720; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] 4721; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b 4722; CHECK-SD-NEXT: add x8, sp, #200 4723; CHECK-SD-NEXT: add x9, sp, #968 4724; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b 4725; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] 4726; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] 4727; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b 4728; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b 4729; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s 4730; CHECK-SD-NEXT: addv s0, v0.4s 4731; CHECK-SD-NEXT: fmov w0, s0 4732; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 4733; CHECK-SD-NEXT: ret 4734; 4735; CHECK-GI-LABEL: test_sdot_v48i8_double_nomla: 4736; CHECK-GI: // %bb.0: // %entry 4737; CHECK-GI-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 4738; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 4739; CHECK-GI-NEXT: .cfi_offset w29, -16 4740; CHECK-GI-NEXT: ldr w10, [sp, #80] 4741; CHECK-GI-NEXT: ldr w11, [sp, #208] 4742; CHECK-GI-NEXT: fmov s0, w0 4743; CHECK-GI-NEXT: ldr w9, [sp, #88] 4744; CHECK-GI-NEXT: ldr w12, [sp, #728] 4745; CHECK-GI-NEXT: movi v6.16b, #1 4746; CHECK-GI-NEXT: fmov s1, w10 4747; CHECK-GI-NEXT: fmov s2, w11 4748; CHECK-GI-NEXT: ldr w11, [sp, #720] 4749; CHECK-GI-NEXT: ldr w10, [sp, #216] 4750; CHECK-GI-NEXT: mov v0.b[1], w1 4751; CHECK-GI-NEXT: ldr w13, [sp, #856] 4752; CHECK-GI-NEXT: fmov s3, w11 4753; CHECK-GI-NEXT: ldr w8, [sp, #96] 4754; CHECK-GI-NEXT: ldr w11, [sp, #224] 4755; CHECK-GI-NEXT: mov v1.b[1], w9 4756; CHECK-GI-NEXT: ldr w9, [sp, #848] 4757; CHECK-GI-NEXT: mov v2.b[1], w10 4758; CHECK-GI-NEXT: ldr w10, [sp, #976] 4759; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 4760; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 4761; CHECK-GI-NEXT: fmov s4, w9 4762; CHECK-GI-NEXT: mov v3.b[1], w12 4763; CHECK-GI-NEXT: ldr w9, [sp, #984] 4764; CHECK-GI-NEXT: fmov s5, w10 4765; CHECK-GI-NEXT: mov v0.b[2], w2 4766; CHECK-GI-NEXT: ldr w10, [sp, #736] 4767; CHECK-GI-NEXT: mov v1.b[2], w8 4768; CHECK-GI-NEXT: ldr w8, [sp, #864] 4769; CHECK-GI-NEXT: mov v2.b[2], w11 4770; CHECK-GI-NEXT: mov v4.b[1], w13 4771; CHECK-GI-NEXT: ldr w11, [sp, #992] 4772; CHECK-GI-NEXT: ldr w12, [sp, #776] 4773; CHECK-GI-NEXT: mov v5.b[1], w9 4774; CHECK-GI-NEXT: mov v3.b[2], w10 4775; CHECK-GI-NEXT: ldr w9, [sp, #104] 4776; CHECK-GI-NEXT: ldr w10, [sp, #232] 4777; CHECK-GI-NEXT: mov v0.b[3], w3 4778; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 4779; CHECK-GI-NEXT: mov v1.b[3], w9 4780; CHECK-GI-NEXT: ldr w9, [sp, #872] 4781; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 4782; CHECK-GI-NEXT: mov v4.b[2], w8 4783; CHECK-GI-NEXT: ldr w8, [sp, #744] 4784; CHECK-GI-NEXT: mov v2.b[3], w10 4785; CHECK-GI-NEXT: mov v5.b[2], w11 4786; CHECK-GI-NEXT: ldr w11, [sp, #1000] 4787; CHECK-GI-NEXT: ldr w10, 
[sp, #240] 4788; CHECK-GI-NEXT: mov v3.b[3], w8 4789; CHECK-GI-NEXT: ldr w8, [sp, #112] 4790; CHECK-GI-NEXT: mov v0.b[4], w4 4791; CHECK-GI-NEXT: movi v19.2d, #0000000000000000 4792; CHECK-GI-NEXT: movi v20.2d, #0000000000000000 4793; CHECK-GI-NEXT: mov v4.b[3], w9 4794; CHECK-GI-NEXT: ldr w9, [sp, #752] 4795; CHECK-GI-NEXT: mov v1.b[4], w8 4796; CHECK-GI-NEXT: ldr w8, [sp, #880] 4797; CHECK-GI-NEXT: mov v5.b[3], w11 4798; CHECK-GI-NEXT: mov v2.b[4], w10 4799; CHECK-GI-NEXT: mov v3.b[4], w9 4800; CHECK-GI-NEXT: ldr w9, [sp, #120] 4801; CHECK-GI-NEXT: ldr w11, [sp, #1008] 4802; CHECK-GI-NEXT: ldr w10, [sp, #248] 4803; CHECK-GI-NEXT: mov v0.b[5], w5 4804; CHECK-GI-NEXT: mov v4.b[4], w8 4805; CHECK-GI-NEXT: ldr w8, [sp, #760] 4806; CHECK-GI-NEXT: mov v1.b[5], w9 4807; CHECK-GI-NEXT: ldr w9, [sp, #888] 4808; CHECK-GI-NEXT: mov v5.b[4], w11 4809; CHECK-GI-NEXT: mov v2.b[5], w10 4810; CHECK-GI-NEXT: mov v3.b[5], w8 4811; CHECK-GI-NEXT: ldr w8, [sp, #128] 4812; CHECK-GI-NEXT: ldr w11, [sp, #1016] 4813; CHECK-GI-NEXT: ldr w10, [sp, #256] 4814; CHECK-GI-NEXT: mov v0.b[6], w6 4815; CHECK-GI-NEXT: mov v4.b[5], w9 4816; CHECK-GI-NEXT: ldr w9, [sp, #768] 4817; CHECK-GI-NEXT: mov v1.b[6], w8 4818; CHECK-GI-NEXT: ldr w8, [sp, #896] 4819; CHECK-GI-NEXT: mov v5.b[5], w11 4820; CHECK-GI-NEXT: mov v2.b[6], w10 4821; CHECK-GI-NEXT: mov v3.b[6], w9 4822; CHECK-GI-NEXT: ldr w9, [sp, #136] 4823; CHECK-GI-NEXT: ldr w11, [sp, #1024] 4824; CHECK-GI-NEXT: ldr w10, [sp, #264] 4825; CHECK-GI-NEXT: mov v0.b[7], w7 4826; CHECK-GI-NEXT: mov v4.b[6], w8 4827; CHECK-GI-NEXT: mov v1.b[7], w9 4828; CHECK-GI-NEXT: ldr w9, [sp, #904] 4829; CHECK-GI-NEXT: mov v5.b[6], w11 4830; CHECK-GI-NEXT: mov v2.b[7], w10 4831; CHECK-GI-NEXT: ldr w8, [sp, #16] 4832; CHECK-GI-NEXT: mov v3.b[7], w12 4833; CHECK-GI-NEXT: ldr w10, [sp, #144] 4834; CHECK-GI-NEXT: ldr w12, [sp, #1032] 4835; CHECK-GI-NEXT: mov v0.b[8], w8 4836; CHECK-GI-NEXT: ldr w8, [sp, #784] 4837; CHECK-GI-NEXT: ldr w11, [sp, #272] 4838; CHECK-GI-NEXT: 
mov v4.b[7], w9 4839; CHECK-GI-NEXT: mov v1.b[8], w10 4840; CHECK-GI-NEXT: ldr w10, [sp, #912] 4841; CHECK-GI-NEXT: mov v5.b[7], w12 4842; CHECK-GI-NEXT: ldr w9, [sp, #24] 4843; CHECK-GI-NEXT: ldr w12, [sp, #1040] 4844; CHECK-GI-NEXT: mov v3.b[8], w8 4845; CHECK-GI-NEXT: ldr w8, [sp, #152] 4846; CHECK-GI-NEXT: mov v2.b[8], w11 4847; CHECK-GI-NEXT: mov v0.b[9], w9 4848; CHECK-GI-NEXT: ldr w9, [sp, #792] 4849; CHECK-GI-NEXT: ldr w11, [sp, #280] 4850; CHECK-GI-NEXT: mov v4.b[8], w10 4851; CHECK-GI-NEXT: mov v1.b[9], w8 4852; CHECK-GI-NEXT: ldr w10, [sp, #920] 4853; CHECK-GI-NEXT: mov v5.b[8], w12 4854; CHECK-GI-NEXT: ldr w8, [sp, #32] 4855; CHECK-GI-NEXT: ldr w12, [sp, #1048] 4856; CHECK-GI-NEXT: mov v3.b[9], w9 4857; CHECK-GI-NEXT: ldr w9, [sp, #160] 4858; CHECK-GI-NEXT: mov v2.b[9], w11 4859; CHECK-GI-NEXT: mov v0.b[10], w8 4860; CHECK-GI-NEXT: ldr w8, [sp, #800] 4861; CHECK-GI-NEXT: ldr w11, [sp, #288] 4862; CHECK-GI-NEXT: mov v4.b[9], w10 4863; CHECK-GI-NEXT: mov v1.b[10], w9 4864; CHECK-GI-NEXT: ldr w10, [sp, #928] 4865; CHECK-GI-NEXT: mov v5.b[9], w12 4866; CHECK-GI-NEXT: ldr w9, [sp, #40] 4867; CHECK-GI-NEXT: ldr w12, [sp, #1056] 4868; CHECK-GI-NEXT: mov v3.b[10], w8 4869; CHECK-GI-NEXT: ldr w8, [sp, #168] 4870; CHECK-GI-NEXT: mov v2.b[10], w11 4871; CHECK-GI-NEXT: mov v0.b[11], w9 4872; CHECK-GI-NEXT: ldr w9, [sp, #808] 4873; CHECK-GI-NEXT: ldr w11, [sp, #296] 4874; CHECK-GI-NEXT: mov v4.b[10], w10 4875; CHECK-GI-NEXT: mov v1.b[11], w8 4876; CHECK-GI-NEXT: ldr w10, [sp, #936] 4877; CHECK-GI-NEXT: mov v5.b[10], w12 4878; CHECK-GI-NEXT: ldr w8, [sp, #48] 4879; CHECK-GI-NEXT: ldr w12, [sp, #1064] 4880; CHECK-GI-NEXT: mov v3.b[11], w9 4881; CHECK-GI-NEXT: ldr w9, [sp, #176] 4882; CHECK-GI-NEXT: mov v2.b[11], w11 4883; CHECK-GI-NEXT: mov v0.b[12], w8 4884; CHECK-GI-NEXT: ldr w8, [sp, #816] 4885; CHECK-GI-NEXT: ldr w11, [sp, #304] 4886; CHECK-GI-NEXT: mov v4.b[11], w10 4887; CHECK-GI-NEXT: mov v1.b[12], w9 4888; CHECK-GI-NEXT: ldr w10, [sp, #944] 4889; 
CHECK-GI-NEXT: mov v5.b[11], w12 4890; CHECK-GI-NEXT: ldr w9, [sp, #56] 4891; CHECK-GI-NEXT: ldr w12, [sp, #1072] 4892; CHECK-GI-NEXT: mov v3.b[12], w8 4893; CHECK-GI-NEXT: ldr w8, [sp, #184] 4894; CHECK-GI-NEXT: mov v2.b[12], w11 4895; CHECK-GI-NEXT: mov v0.b[13], w9 4896; CHECK-GI-NEXT: ldr w9, [sp, #824] 4897; CHECK-GI-NEXT: ldr w11, [sp, #312] 4898; CHECK-GI-NEXT: mov v4.b[12], w10 4899; CHECK-GI-NEXT: mov v1.b[13], w8 4900; CHECK-GI-NEXT: ldr w10, [sp, #952] 4901; CHECK-GI-NEXT: mov v5.b[12], w12 4902; CHECK-GI-NEXT: ldr w8, [sp, #64] 4903; CHECK-GI-NEXT: ldr w12, [sp, #1080] 4904; CHECK-GI-NEXT: mov v3.b[13], w9 4905; CHECK-GI-NEXT: ldr w9, [sp, #192] 4906; CHECK-GI-NEXT: mov v2.b[13], w11 4907; CHECK-GI-NEXT: mov v0.b[14], w8 4908; CHECK-GI-NEXT: ldr w8, [sp, #832] 4909; CHECK-GI-NEXT: ldr w11, [sp, #320] 4910; CHECK-GI-NEXT: mov v4.b[13], w10 4911; CHECK-GI-NEXT: mov v1.b[14], w9 4912; CHECK-GI-NEXT: ldr w10, [sp, #960] 4913; CHECK-GI-NEXT: mov v5.b[13], w12 4914; CHECK-GI-NEXT: ldr w9, [sp, #72] 4915; CHECK-GI-NEXT: ldr w12, [sp, #1088] 4916; CHECK-GI-NEXT: mov v3.b[14], w8 4917; CHECK-GI-NEXT: ldr w8, [sp, #200] 4918; CHECK-GI-NEXT: mov v2.b[14], w11 4919; CHECK-GI-NEXT: mov v0.b[15], w9 4920; CHECK-GI-NEXT: ldr w9, [sp, #840] 4921; CHECK-GI-NEXT: ldr w11, [sp, #328] 4922; CHECK-GI-NEXT: mov v4.b[14], w10 4923; CHECK-GI-NEXT: mov v1.b[15], w8 4924; CHECK-GI-NEXT: ldr w8, [sp, #968] 4925; CHECK-GI-NEXT: mov v5.b[14], w12 4926; CHECK-GI-NEXT: ldr w10, [sp, #1096] 4927; CHECK-GI-NEXT: mov v3.b[15], w9 4928; CHECK-GI-NEXT: mov v2.b[15], w11 4929; CHECK-GI-NEXT: sdot v7.4s, v0.16b, v6.16b 4930; CHECK-GI-NEXT: mov v4.b[15], w8 4931; CHECK-GI-NEXT: sdot v16.4s, v1.16b, v6.16b 4932; CHECK-GI-NEXT: mov v5.b[15], w10 4933; CHECK-GI-NEXT: sdot v17.4s, v3.16b, v6.16b 4934; CHECK-GI-NEXT: sdot v20.4s, v2.16b, v6.16b 4935; CHECK-GI-NEXT: addv s0, v7.4s 4936; CHECK-GI-NEXT: sdot v18.4s, v4.16b, v6.16b 4937; CHECK-GI-NEXT: addv s1, v16.4s 4938; CHECK-GI-NEXT: sdot 
v19.4s, v5.16b, v6.16b 4939; CHECK-GI-NEXT: addv s2, v17.4s 4940; CHECK-GI-NEXT: addv s4, v20.4s 4941; CHECK-GI-NEXT: fmov w8, s0 4942; CHECK-GI-NEXT: fmov w9, s1 4943; CHECK-GI-NEXT: addv s3, v18.4s 4944; CHECK-GI-NEXT: addv s5, v19.4s 4945; CHECK-GI-NEXT: fmov w10, s2 4946; CHECK-GI-NEXT: add w8, w8, w9 4947; CHECK-GI-NEXT: fmov w9, s4 4948; CHECK-GI-NEXT: fmov w11, s3 4949; CHECK-GI-NEXT: add w8, w8, w9 4950; CHECK-GI-NEXT: add w10, w10, w11 4951; CHECK-GI-NEXT: fmov w11, s5 4952; CHECK-GI-NEXT: add w9, w10, w11 4953; CHECK-GI-NEXT: add w0, w8, w9 4954; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 4955; CHECK-GI-NEXT: ret 4956entry: 4957 %az = sext <48 x i8> %a to <48 x i32> 4958 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %az) 4959 %cz = sext <48 x i8> %c to <48 x i32> 4960 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %cz) 4961 %x = add i32 %r1, %r2 4962 ret i32 %x 4963} 4964 ; test_udot_v64i8: loads <64 x i8> from %a and %b, zero-extends both to i32, multiplies lane-wise, add-reduces with vector.reduce.add and adds %sum. Both the SelectionDAG and the GlobalISel expectations below use udot on 16-byte chunks. 4965define i32 @test_udot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 4966; CHECK-SD-LABEL: test_udot_v64i8: 4967; CHECK-SD: // %bb.0: // %entry 4968; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 4969; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 4970; CHECK-SD-NEXT: ldp q2, q3, [x0, #32] 4971; CHECK-SD-NEXT: ldp q4, q5, [x1, #32] 4972; CHECK-SD-NEXT: udot v1.4s, v5.16b, v3.16b 4973; CHECK-SD-NEXT: udot v0.4s, v4.16b, v2.16b 4974; CHECK-SD-NEXT: ldp q2, q3, [x0] 4975; CHECK-SD-NEXT: ldp q4, q5, [x1] 4976; CHECK-SD-NEXT: udot v1.4s, v5.16b, v3.16b 4977; CHECK-SD-NEXT: udot v0.4s, v4.16b, v2.16b 4978; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 4979; CHECK-SD-NEXT: addv s0, v0.4s 4980; CHECK-SD-NEXT: fmov w8, s0 4981; CHECK-SD-NEXT: add w0, w8, w2 4982; CHECK-SD-NEXT: ret 4983; 4984; CHECK-GI-LABEL: test_udot_v64i8: 4985; CHECK-GI: // %bb.0: // %entry 4986; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 4987; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 4988; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 4989; CHECK-GI-NEXT: movi
v5.2d, #0000000000000000 4990; CHECK-GI-NEXT: ldp q1, q2, [x0] 4991; CHECK-GI-NEXT: ldp q6, q7, [x0, #32] 4992; CHECK-GI-NEXT: ldp q16, q17, [x1] 4993; CHECK-GI-NEXT: ldp q18, q19, [x1, #32] 4994; CHECK-GI-NEXT: udot v0.4s, v16.16b, v1.16b 4995; CHECK-GI-NEXT: udot v4.4s, v17.16b, v2.16b 4996; CHECK-GI-NEXT: udot v5.4s, v18.16b, v6.16b 4997; CHECK-GI-NEXT: udot v3.4s, v19.16b, v7.16b 4998; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s 4999; CHECK-GI-NEXT: add v1.4s, v5.4s, v3.4s 5000; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5001; CHECK-GI-NEXT: addv s0, v0.4s 5002; CHECK-GI-NEXT: fmov w8, s0 5003; CHECK-GI-NEXT: add w0, w8, w2 5004; CHECK-GI-NEXT: ret 5005entry: 5006 %0 = load <64 x i8>, ptr %a 5007 %1 = zext <64 x i8> %0 to <64 x i32> 5008 %2 = load <64 x i8>, ptr %b 5009 %3 = zext <64 x i8> %2 to <64 x i32> 5010 %4 = mul nuw nsw <64 x i32> %3, %1 5011 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4) 5012 %op.extra = add i32 %5, %sum 5013 ret i32 %op.extra 5014} 5015 ; test_udot_v64i8_nomla: add-reduction of a zero-extended <64 x i8> load from %a1 with no multiply in the IR. The expected code still uses udot, with a splat of 1 (movi v0.16b, #1) as the second operand. 5016define i32 @test_udot_v64i8_nomla(ptr nocapture readonly %a1) { 5017; CHECK-SD-LABEL: test_udot_v64i8_nomla: 5018; CHECK-SD: // %bb.0: // %entry 5019; CHECK-SD-NEXT: movi v0.16b, #1 5020; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 5021; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 5022; CHECK-SD-NEXT: ldp q3, q4, [x0, #32] 5023; CHECK-SD-NEXT: udot v2.4s, v4.16b, v0.16b 5024; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b 5025; CHECK-SD-NEXT: ldp q3, q4, [x0] 5026; CHECK-SD-NEXT: udot v2.4s, v4.16b, v0.16b 5027; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b 5028; CHECK-SD-NEXT: add v0.4s, v1.4s, v2.4s 5029; CHECK-SD-NEXT: addv s0, v0.4s 5030; CHECK-SD-NEXT: fmov w0, s0 5031; CHECK-SD-NEXT: ret 5032; 5033; CHECK-GI-LABEL: test_udot_v64i8_nomla: 5034; CHECK-GI: // %bb.0: // %entry 5035; CHECK-GI-NEXT: movi v0.16b, #1 5036; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 5037; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 5038; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 5039; CHECK-GI-NEXT: ldp q5, q6,
[x0] 5040; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 5041; CHECK-GI-NEXT: ldp q7, q16, [x0, #32] 5042; CHECK-GI-NEXT: udot v1.4s, v5.16b, v0.16b 5043; CHECK-GI-NEXT: udot v3.4s, v6.16b, v0.16b 5044; CHECK-GI-NEXT: udot v2.4s, v16.16b, v0.16b 5045; CHECK-GI-NEXT: udot v4.4s, v7.16b, v0.16b 5046; CHECK-GI-NEXT: add v0.4s, v1.4s, v3.4s 5047; CHECK-GI-NEXT: add v1.4s, v4.4s, v2.4s 5048; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5049; CHECK-GI-NEXT: addv s0, v0.4s 5050; CHECK-GI-NEXT: fmov w0, s0 5051; CHECK-GI-NEXT: ret 5052entry: 5053 %0 = load <64 x i8>, ptr %a1 5054 %1 = zext <64 x i8> %0 to <64 x i32> 5055 %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1) 5056 ret i32 %2 5057} 5058define i32 @test_sdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; Signed variant: <64 x i8> loads from %a and %b are sign-extended to i32, multiplied (mul nsw), add-reduced and added to %sum. Both the SelectionDAG and the GlobalISel expectations use sdot. 5059; CHECK-SD-LABEL: test_sdot_v64i8: 5060; CHECK-SD: // %bb.0: // %entry 5061; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 5062; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 5063; CHECK-SD-NEXT: ldp q2, q3, [x0, #32] 5064; CHECK-SD-NEXT: ldp q4, q5, [x1, #32] 5065; CHECK-SD-NEXT: sdot v1.4s, v5.16b, v3.16b 5066; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v2.16b 5067; CHECK-SD-NEXT: ldp q2, q3, [x0] 5068; CHECK-SD-NEXT: ldp q4, q5, [x1] 5069; CHECK-SD-NEXT: sdot v1.4s, v5.16b, v3.16b 5070; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v2.16b 5071; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 5072; CHECK-SD-NEXT: addv s0, v0.4s 5073; CHECK-SD-NEXT: fmov w8, s0 5074; CHECK-SD-NEXT: add w0, w8, w2 5075; CHECK-SD-NEXT: ret 5076; 5077; CHECK-GI-LABEL: test_sdot_v64i8: 5078; CHECK-GI: // %bb.0: // %entry 5079; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 5080; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 5081; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 5082; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 5083; CHECK-GI-NEXT: ldp q1, q2, [x0] 5084; CHECK-GI-NEXT: ldp q6, q7, [x0, #32] 5085; CHECK-GI-NEXT: ldp q16, q17, [x1] 5086; CHECK-GI-NEXT: ldp q18, q19, [x1, #32] 5087; CHECK-GI-NEXT: sdot v0.4s, v16.16b, v1.16b
5088; CHECK-GI-NEXT: sdot v4.4s, v17.16b, v2.16b 5089; CHECK-GI-NEXT: sdot v5.4s, v18.16b, v6.16b 5090; CHECK-GI-NEXT: sdot v3.4s, v19.16b, v7.16b 5091; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s 5092; CHECK-GI-NEXT: add v1.4s, v5.4s, v3.4s 5093; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5094; CHECK-GI-NEXT: addv s0, v0.4s 5095; CHECK-GI-NEXT: fmov w8, s0 5096; CHECK-GI-NEXT: add w0, w8, w2 5097; CHECK-GI-NEXT: ret 5098entry: 5099 %0 = load <64 x i8>, ptr %a 5100 %1 = sext <64 x i8> %0 to <64 x i32> 5101 %2 = load <64 x i8>, ptr %b 5102 %3 = sext <64 x i8> %2 to <64 x i32> 5103 %4 = mul nsw <64 x i32> %3, %1 5104 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4) 5105 %op.extra = add nsw i32 %5, %sum 5106 ret i32 %op.extra 5107} 5108 ; test_sdot_v64i8_double: two products of sign-extended by-value <64 x i8> arguments (%a*%b and %c*%d) are each add-reduced and the two sums are added. The expectations use sdot and load part of the operands from the stack with ldp. NOTE(review): both mul instructions carry nuw although their operands come from sext - confirm the flag is intended. 5109define i32 @test_sdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { 5110; CHECK-SD-LABEL: test_sdot_v64i8_double: 5111; CHECK-SD: // %bb.0: // %entry 5112; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 5113; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 5114; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 5115; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 5116; CHECK-SD-NEXT: ldp q20, q21, [sp, #96] 5117; CHECK-SD-NEXT: ldp q22, q23, [sp, #32] 5118; CHECK-SD-NEXT: sdot v16.4s, v3.16b, v7.16b 5119; CHECK-SD-NEXT: sdot v18.4s, v2.16b, v6.16b 5120; CHECK-SD-NEXT: sdot v19.4s, v23.16b, v21.16b 5121; CHECK-SD-NEXT: sdot v17.4s, v22.16b, v20.16b 5122; CHECK-SD-NEXT: ldp q2, q3, [sp, #64] 5123; CHECK-SD-NEXT: ldp q6, q7, [sp] 5124; CHECK-SD-NEXT: sdot v16.4s, v1.16b, v5.16b 5125; CHECK-SD-NEXT: sdot v18.4s, v0.16b, v4.16b 5126; CHECK-SD-NEXT: sdot v19.4s, v7.16b, v3.16b 5127; CHECK-SD-NEXT: sdot v17.4s, v6.16b, v2.16b 5128; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s 5129; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s 5130; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 5131; CHECK-SD-NEXT: addv s0, v0.4s 5132; CHECK-SD-NEXT: fmov w0, s0 5133; CHECK-SD-NEXT: ret 5134; 5135; CHECK-GI-LABEL: test_sdot_v64i8_double: 5136;
CHECK-GI: // %bb.0: // %entry 5137; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 5138; CHECK-GI-NEXT: movi v21.2d, #0000000000000000 5139; CHECK-GI-NEXT: movi v22.2d, #0000000000000000 5140; CHECK-GI-NEXT: movi v23.2d, #0000000000000000 5141; CHECK-GI-NEXT: ldp q16, q17, [sp] 5142; CHECK-GI-NEXT: movi v24.2d, #0000000000000000 5143; CHECK-GI-NEXT: movi v25.2d, #0000000000000000 5144; CHECK-GI-NEXT: movi v26.2d, #0000000000000000 5145; CHECK-GI-NEXT: movi v27.2d, #0000000000000000 5146; CHECK-GI-NEXT: ldp q19, q20, [sp, #32] 5147; CHECK-GI-NEXT: sdot v18.4s, v0.16b, v4.16b 5148; CHECK-GI-NEXT: ldp q0, q4, [sp, #64] 5149; CHECK-GI-NEXT: sdot v21.4s, v1.16b, v5.16b 5150; CHECK-GI-NEXT: ldp q1, q5, [sp, #96] 5151; CHECK-GI-NEXT: sdot v22.4s, v2.16b, v6.16b 5152; CHECK-GI-NEXT: sdot v23.4s, v3.16b, v7.16b 5153; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v0.16b 5154; CHECK-GI-NEXT: sdot v26.4s, v17.16b, v4.16b 5155; CHECK-GI-NEXT: sdot v27.4s, v19.16b, v1.16b 5156; CHECK-GI-NEXT: sdot v25.4s, v20.16b, v5.16b 5157; CHECK-GI-NEXT: add v0.4s, v18.4s, v21.4s 5158; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s 5159; CHECK-GI-NEXT: add v2.4s, v24.4s, v26.4s 5160; CHECK-GI-NEXT: add v3.4s, v27.4s, v25.4s 5161; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5162; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s 5163; CHECK-GI-NEXT: addv s0, v0.4s 5164; CHECK-GI-NEXT: addv s1, v1.4s 5165; CHECK-GI-NEXT: fmov w8, s0 5166; CHECK-GI-NEXT: fmov w9, s1 5167; CHECK-GI-NEXT: add w0, w8, w9 5168; CHECK-GI-NEXT: ret 5169entry: 5170 %az = sext <64 x i8> %a to <64 x i32> 5171 %bz = sext <64 x i8> %b to <64 x i32> 5172 %m1 = mul nuw nsw <64 x i32> %az, %bz 5173 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1) 5174 %cz = sext <64 x i8> %c to <64 x i32> 5175 %dz = sext <64 x i8> %d to <64 x i32> 5176 %m2 = mul nuw nsw <64 x i32> %cz, %dz 5177 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2) 5178 %x = add i32 %r1, %r2 5179 ret i32 %x 5180} 5181 ; test_sdot_v64i8_double_nomla: add-reduces sext(%a) and sext(%c) separately and sums the two results, with no multiply in the IR. %b and %d are declared but not used by the body. The expectations use sdot against a splat of 1 (movi v4.16b, #1). 5182define i32 @test_sdot_v64i8_double_nomla(<64 x
i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { 5183; CHECK-SD-LABEL: test_sdot_v64i8_double_nomla: 5184; CHECK-SD: // %bb.0: // %entry 5185; CHECK-SD-NEXT: movi v4.16b, #1 5186; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 5187; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 5188; CHECK-SD-NEXT: movi v7.2d, #0000000000000000 5189; CHECK-SD-NEXT: ldp q17, q18, [sp, #32] 5190; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 5191; CHECK-SD-NEXT: sdot v5.4s, v3.16b, v4.16b 5192; CHECK-SD-NEXT: sdot v6.4s, v17.16b, v4.16b 5193; CHECK-SD-NEXT: sdot v7.4s, v2.16b, v4.16b 5194; CHECK-SD-NEXT: ldp q2, q3, [sp] 5195; CHECK-SD-NEXT: sdot v16.4s, v18.16b, v4.16b 5196; CHECK-SD-NEXT: sdot v5.4s, v1.16b, v4.16b 5197; CHECK-SD-NEXT: sdot v6.4s, v2.16b, v4.16b 5198; CHECK-SD-NEXT: sdot v7.4s, v0.16b, v4.16b 5199; CHECK-SD-NEXT: sdot v16.4s, v3.16b, v4.16b 5200; CHECK-SD-NEXT: add v0.4s, v7.4s, v5.4s 5201; CHECK-SD-NEXT: add v1.4s, v6.4s, v16.4s 5202; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 5203; CHECK-SD-NEXT: addv s0, v0.4s 5204; CHECK-SD-NEXT: fmov w0, s0 5205; CHECK-SD-NEXT: ret 5206; 5207; CHECK-GI-LABEL: test_sdot_v64i8_double_nomla: 5208; CHECK-GI: // %bb.0: // %entry 5209; CHECK-GI-NEXT: movi v4.16b, #1 5210; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 5211; CHECK-GI-NEXT: movi v6.2d, #0000000000000000 5212; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 5213; CHECK-GI-NEXT: ldp q21, q22, [sp] 5214; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 5215; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 5216; CHECK-GI-NEXT: movi v18.2d, #0000000000000000 5217; CHECK-GI-NEXT: movi v19.2d, #0000000000000000 5218; CHECK-GI-NEXT: movi v20.2d, #0000000000000000 5219; CHECK-GI-NEXT: sdot v5.4s, v0.16b, v4.16b 5220; CHECK-GI-NEXT: sdot v6.4s, v1.16b, v4.16b 5221; CHECK-GI-NEXT: ldp q0, q1, [sp, #32] 5222; CHECK-GI-NEXT: sdot v7.4s, v2.16b, v4.16b 5223; CHECK-GI-NEXT: sdot v16.4s, v3.16b, v4.16b 5224; CHECK-GI-NEXT: sdot v17.4s, v21.16b, v4.16b 5225; CHECK-GI-NEXT: sdot v19.4s, v22.16b,
v4.16b 5226; CHECK-GI-NEXT: sdot v20.4s, v0.16b, v4.16b 5227; CHECK-GI-NEXT: sdot v18.4s, v1.16b, v4.16b 5228; CHECK-GI-NEXT: add v0.4s, v5.4s, v6.4s 5229; CHECK-GI-NEXT: add v1.4s, v7.4s, v16.4s 5230; CHECK-GI-NEXT: add v2.4s, v17.4s, v19.4s 5231; CHECK-GI-NEXT: add v3.4s, v20.4s, v18.4s 5232; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5233; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s 5234; CHECK-GI-NEXT: addv s0, v0.4s 5235; CHECK-GI-NEXT: addv s1, v1.4s 5236; CHECK-GI-NEXT: fmov w8, s0 5237; CHECK-GI-NEXT: fmov w9, s1 5238; CHECK-GI-NEXT: add w0, w8, w9 5239; CHECK-GI-NEXT: ret 5240entry: 5241 %az = sext <64 x i8> %a to <64 x i32> 5242 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %az) 5243 %cz = sext <64 x i8> %c to <64 x i32> 5244 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %cz) 5245 %x = add i32 %r1, %r2 5246 ret i32 %x 5247} 5248 ; test_usdot_v64i8: mixed-sign dot product - the <64 x i8> load from %a is zero-extended, the load from %b is sign-extended, then mul nsw, vector.reduce.add and an add of %sum. The SelectionDAG expectation uses usdot. The GlobalISel expectation contains no usdot: it widens with ushll/sshll, multiplies with mul/mla and saves and restores d8-d15. 5249define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { 5250; CHECK-SD-LABEL: test_usdot_v64i8: 5251; CHECK-SD: // %bb.0: // %entry 5252; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 5253; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 5254; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 5255; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 5256; CHECK-SD-NEXT: ldp q1, q2, [x0, #32] 5257; CHECK-SD-NEXT: ldp q6, q7, [x1, #32] 5258; CHECK-SD-NEXT: ldp q16, q17, [x0] 5259; CHECK-SD-NEXT: ldp q18, q19, [x1] 5260; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b 5261; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b 5262; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b 5263; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b 5264; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s 5265; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s 5266; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s 5267; CHECK-SD-NEXT: addv s0, v0.4s 5268; CHECK-SD-NEXT: fmov w8, s0 5269; CHECK-SD-NEXT: add w0, w8, w2 5270; CHECK-SD-NEXT: ret 5271; 5272; CHECK-GI-LABEL: test_usdot_v64i8: 5273; CHECK-GI: // %bb.0: // %entry 5274; CHECK-GI-NEXT: stp d15, d14, [sp,
#-64]! // 16-byte Folded Spill 5275; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill 5276; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill 5277; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill 5278; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 5279; CHECK-GI-NEXT: .cfi_offset b8, -8 5280; CHECK-GI-NEXT: .cfi_offset b9, -16 5281; CHECK-GI-NEXT: .cfi_offset b10, -24 5282; CHECK-GI-NEXT: .cfi_offset b11, -32 5283; CHECK-GI-NEXT: .cfi_offset b12, -40 5284; CHECK-GI-NEXT: .cfi_offset b13, -48 5285; CHECK-GI-NEXT: .cfi_offset b14, -56 5286; CHECK-GI-NEXT: .cfi_offset b15, -64 5287; CHECK-GI-NEXT: ldp q0, q1, [x1] 5288; CHECK-GI-NEXT: ldp q21, q17, [x0] 5289; CHECK-GI-NEXT: ldp q3, q19, [x1, #32] 5290; CHECK-GI-NEXT: ldp q18, q4, [x0, #32] 5291; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 5292; CHECK-GI-NEXT: sshll2 v5.8h, v0.16b, #0 5293; CHECK-GI-NEXT: sshll v7.8h, v1.8b, #0 5294; CHECK-GI-NEXT: sshll2 v22.8h, v1.16b, #0 5295; CHECK-GI-NEXT: sshll v23.8h, v3.8b, #0 5296; CHECK-GI-NEXT: sshll2 v24.8h, v3.16b, #0 5297; CHECK-GI-NEXT: sshll v25.8h, v19.8b, #0 5298; CHECK-GI-NEXT: sshll2 v26.8h, v19.16b, #0 5299; CHECK-GI-NEXT: ushll v27.8h, v21.8b, #0 5300; CHECK-GI-NEXT: ushll2 v28.8h, v21.16b, #0 5301; CHECK-GI-NEXT: ushll v30.8h, v17.8b, #0 5302; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0 5303; CHECK-GI-NEXT: ushll v8.8h, v18.8b, #0 5304; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 5305; CHECK-GI-NEXT: ushll v9.8h, v4.8b, #0 5306; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 5307; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 5308; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 5309; CHECK-GI-NEXT: sshll v1.4s, v5.4h, #0 5310; CHECK-GI-NEXT: sshll2 v16.4s, v5.8h, #0 5311; CHECK-GI-NEXT: sshll v2.4s, v7.4h, #0 5312; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0 5313; CHECK-GI-NEXT: sshll v3.4s, v22.4h, #0 5314; CHECK-GI-NEXT: sshll2 v22.4s, v22.8h, #0 5315; CHECK-GI-NEXT: sshll v5.4s, v23.4h, #0 5316; CHECK-GI-NEXT: sshll2 v23.4s, v23.8h, #0 5317; CHECK-GI-NEXT: sshll
v7.4s, v24.4h, #0 5318; CHECK-GI-NEXT: sshll2 v24.4s, v24.8h, #0 5319; CHECK-GI-NEXT: sshll v19.4s, v25.4h, #0 5320; CHECK-GI-NEXT: sshll2 v25.4s, v25.8h, #0 5321; CHECK-GI-NEXT: sshll v21.4s, v26.4h, #0 5322; CHECK-GI-NEXT: sshll2 v26.4s, v26.8h, #0 5323; CHECK-GI-NEXT: ushll v29.4s, v27.4h, #0 5324; CHECK-GI-NEXT: ushll2 v27.4s, v27.8h, #0 5325; CHECK-GI-NEXT: ushll v31.4s, v28.4h, #0 5326; CHECK-GI-NEXT: ushll2 v28.4s, v28.8h, #0 5327; CHECK-GI-NEXT: ushll v10.4s, v30.4h, #0 5328; CHECK-GI-NEXT: ushll2 v30.4s, v30.8h, #0 5329; CHECK-GI-NEXT: ushll v11.4s, v17.4h, #0 5330; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 5331; CHECK-GI-NEXT: ushll2 v12.4s, v8.8h, #0 5332; CHECK-GI-NEXT: ushll2 v13.4s, v18.8h, #0 5333; CHECK-GI-NEXT: ushll2 v14.4s, v9.8h, #0 5334; CHECK-GI-NEXT: ushll2 v15.4s, v4.8h, #0 5335; CHECK-GI-NEXT: mul v6.4s, v6.4s, v27.4s 5336; CHECK-GI-NEXT: mul v16.4s, v16.4s, v28.4s 5337; CHECK-GI-NEXT: mul v20.4s, v20.4s, v30.4s 5338; CHECK-GI-NEXT: mul v17.4s, v22.4s, v17.4s 5339; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 5340; CHECK-GI-NEXT: mul v22.4s, v23.4s, v12.4s 5341; CHECK-GI-NEXT: mul v23.4s, v24.4s, v13.4s 5342; CHECK-GI-NEXT: mul v24.4s, v25.4s, v14.4s 5343; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload 5344; CHECK-GI-NEXT: mul v25.4s, v26.4s, v15.4s 5345; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 5346; CHECK-GI-NEXT: ushll v26.4s, v9.4h, #0 5347; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 5348; CHECK-GI-NEXT: mla v6.4s, v0.4s, v29.4s 5349; CHECK-GI-NEXT: mla v16.4s, v1.4s, v31.4s 5350; CHECK-GI-NEXT: mla v20.4s, v2.4s, v10.4s 5351; CHECK-GI-NEXT: mla v17.4s, v3.4s, v11.4s 5352; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload 5353; CHECK-GI-NEXT: mla v22.4s, v5.4s, v8.4s 5354; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload 5355; CHECK-GI-NEXT: mla v23.4s, v7.4s, v18.4s 5356; CHECK-GI-NEXT: mla v24.4s, v19.4s, v26.4s 5357; CHECK-GI-NEXT: mla v25.4s, v21.4s, v4.4s 5358; CHECK-GI-NEXT: add v0.4s, v6.4s, v16.4s
5359; CHECK-GI-NEXT: add v1.4s, v20.4s, v17.4s 5360; CHECK-GI-NEXT: add v2.4s, v22.4s, v23.4s 5361; CHECK-GI-NEXT: add v3.4s, v24.4s, v25.4s 5362; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5363; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s 5364; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 5365; CHECK-GI-NEXT: addv s0, v0.4s 5366; CHECK-GI-NEXT: fmov w8, s0 5367; CHECK-GI-NEXT: add w0, w8, w2 5368; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload 5369; CHECK-GI-NEXT: ret 5370entry: 5371 %0 = load <64 x i8>, ptr %a 5372 %1 = zext <64 x i8> %0 to <64 x i32> 5373 %2 = load <64 x i8>, ptr %b 5374 %3 = sext <64 x i8> %2 to <64 x i32> 5375 %4 = mul nsw <64 x i32> %3, %1 5376 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4) 5377 %op.extra = add nsw i32 %5, %sum 5378 ret i32 %op.extra 5379} 5380 5381define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { 5382; CHECK-SD-LABEL: test_usdot_v64i8_double: 5383; CHECK-SD: // %bb.0: // %entry 5384; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 5385; CHECK-SD-NEXT: movi v21.2d, #0000000000000000 5386; CHECK-SD-NEXT: movi v22.2d, #0000000000000000 5387; CHECK-SD-NEXT: movi v23.2d, #0000000000000000 5388; CHECK-SD-NEXT: ldp q16, q17, [sp, #64] 5389; CHECK-SD-NEXT: movi v24.2d, #0000000000000000 5390; CHECK-SD-NEXT: movi v25.2d, #0000000000000000 5391; CHECK-SD-NEXT: movi v26.2d, #0000000000000000 5392; CHECK-SD-NEXT: movi v27.2d, #0000000000000000 5393; CHECK-SD-NEXT: ldp q19, q20, [sp, #96] 5394; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b 5395; CHECK-SD-NEXT: ldp q3, q7, [sp, #32] 5396; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b 5397; CHECK-SD-NEXT: ldp q1, q5, [sp] 5398; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b 5399; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b 5400; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b 5401; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b 5402; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b 5403; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b 5404;
CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s 5405; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s 5406; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s 5407; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s 5408; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s 5409; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s 5410; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s 5411; CHECK-SD-NEXT: addv s0, v0.4s 5412; CHECK-SD-NEXT: fmov w0, s0 5413; CHECK-SD-NEXT: ret 5414; 5415; CHECK-GI-LABEL: test_usdot_v64i8_double: 5416; CHECK-GI: // %bb.0: // %entry 5417; CHECK-GI-NEXT: sub sp, sp, #304 5418; CHECK-GI-NEXT: stp d15, d14, [sp, #224] // 16-byte Folded Spill 5419; CHECK-GI-NEXT: stp d13, d12, [sp, #240] // 16-byte Folded Spill 5420; CHECK-GI-NEXT: stp d11, d10, [sp, #256] // 16-byte Folded Spill 5421; CHECK-GI-NEXT: stp d9, d8, [sp, #272] // 16-byte Folded Spill 5422; CHECK-GI-NEXT: str x29, [sp, #288] // 8-byte Folded Spill 5423; CHECK-GI-NEXT: .cfi_def_cfa_offset 304 5424; CHECK-GI-NEXT: .cfi_offset w29, -16 5425; CHECK-GI-NEXT: .cfi_offset b8, -24 5426; CHECK-GI-NEXT: .cfi_offset b9, -32 5427; CHECK-GI-NEXT: .cfi_offset b10, -40 5428; CHECK-GI-NEXT: .cfi_offset b11, -48 5429; CHECK-GI-NEXT: .cfi_offset b12, -56 5430; CHECK-GI-NEXT: .cfi_offset b13, -64 5431; CHECK-GI-NEXT: .cfi_offset b14, -72 5432; CHECK-GI-NEXT: .cfi_offset b15, -80 5433; CHECK-GI-NEXT: ushll v17.8h, v0.8b, #0 5434; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 5435; CHECK-GI-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload 5436; CHECK-GI-NEXT: mov v20.16b, v3.16b 5437; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0 5438; CHECK-GI-NEXT: ushll2 v18.8h, v1.16b, #0 5439; CHECK-GI-NEXT: ushll v26.8h, v2.8b, #0 5440; CHECK-GI-NEXT: ldp q27, q28, [sp, #304] 5441; CHECK-GI-NEXT: ushll2 v29.8h, v2.16b, #0 5442; CHECK-GI-NEXT: ushll v2.4s, v17.4h, #0 5443; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 5444; CHECK-GI-NEXT: sshll v8.8h, v4.8b, #0 5445; CHECK-GI-NEXT: ldp q23, q21, [sp, #368] 5446; CHECK-GI-NEXT: sshll2 v9.8h, v4.16b, #0 5447; CHECK-GI-NEXT: sshll2 v11.8h, v5.16b, #0 
5448; CHECK-GI-NEXT: mov v25.16b, v7.16b 5449; CHECK-GI-NEXT: ushll2 v19.4s, v17.8h, #0 5450; CHECK-GI-NEXT: stp q1, q2, [sp, #192] // 32-byte Folded Spill 5451; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 5452; CHECK-GI-NEXT: ushll2 v17.4s, v18.8h, #0 5453; CHECK-GI-NEXT: ldp q24, q22, [sp, #336] 5454; CHECK-GI-NEXT: sshll v10.8h, v5.8b, #0 5455; CHECK-GI-NEXT: sshll v12.8h, v6.8b, #0 5456; CHECK-GI-NEXT: sshll2 v13.8h, v6.16b, #0 5457; CHECK-GI-NEXT: mov v2.16b, v20.16b 5458; CHECK-GI-NEXT: sshll2 v0.4s, v8.8h, #0 5459; CHECK-GI-NEXT: sshll2 v4.4s, v9.8h, #0 5460; CHECK-GI-NEXT: sshll2 v6.4s, v11.8h, #0 5461; CHECK-GI-NEXT: ushll2 v7.4s, v16.8h, #0 5462; CHECK-GI-NEXT: ushll2 v31.4s, v29.8h, #0 5463; CHECK-GI-NEXT: sshll2 v5.4s, v10.8h, #0 5464; CHECK-GI-NEXT: sshll2 v1.4s, v13.8h, #0 5465; CHECK-GI-NEXT: ushll2 v30.4s, v26.8h, #0 5466; CHECK-GI-NEXT: ushll v14.8h, v2.8b, #0 5467; CHECK-GI-NEXT: mul v20.4s, v19.4s, v0.4s 5468; CHECK-GI-NEXT: mul v19.4s, v3.4s, v4.4s 5469; CHECK-GI-NEXT: sshll v0.8h, v25.8b, #0 5470; CHECK-GI-NEXT: mul v4.4s, v17.4s, v6.4s 5471; CHECK-GI-NEXT: sshll2 v15.4s, v12.8h, #0 5472; CHECK-GI-NEXT: ldp q17, q3, [sp, #400] 5473; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s 5474; CHECK-GI-NEXT: mul v7.4s, v31.4s, v1.4s 5475; CHECK-GI-NEXT: ushll2 v31.8h, v2.16b, #0 5476; CHECK-GI-NEXT: sshll2 v25.8h, v25.16b, #0 5477; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0 5478; CHECK-GI-NEXT: ushll v2.4s, v14.4h, #0 5479; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 5480; CHECK-GI-NEXT: str q3, [sp, #96] // 16-byte Folded Spill 5481; CHECK-GI-NEXT: ushll2 v3.4s, v14.8h, #0 5482; CHECK-GI-NEXT: mul v6.4s, v30.4s, v15.4s 5483; CHECK-GI-NEXT: str q31, [sp, #160] // 16-byte Folded Spill 5484; CHECK-GI-NEXT: ushll v30.4s, v26.4h, #0 5485; CHECK-GI-NEXT: sshll v26.4s, v8.4h, #0 5486; CHECK-GI-NEXT: ushll v14.8h, v27.8b, #0 5487; CHECK-GI-NEXT: ushll v15.4s, v29.4h, #0 5488; CHECK-GI-NEXT: sshll v29.4s, v9.4h, #0 5489; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s 5490; CHECK-GI-NEXT: 
ushll2 v3.4s, v31.8h, #0 5491; CHECK-GI-NEXT: ushll v31.8h, v28.8b, #0 5492; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 5493; CHECK-GI-NEXT: sshll v8.4s, v10.4h, #0 5494; CHECK-GI-NEXT: sshll v9.4s, v11.4h, #0 5495; CHECK-GI-NEXT: sshll v10.4s, v12.4h, #0 5496; CHECK-GI-NEXT: sshll v11.4s, v13.4h, #0 5497; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 5498; CHECK-GI-NEXT: stp q3, q25, [sp, #112] // 32-byte Folded Spill 5499; CHECK-GI-NEXT: ldr q3, [sp, #208] // 16-byte Folded Reload 5500; CHECK-GI-NEXT: ushll2 v28.8h, v28.16b, #0 5501; CHECK-GI-NEXT: mla v1.4s, v2.4s, v0.4s 5502; CHECK-GI-NEXT: ushll2 v0.4s, v31.8h, #0 5503; CHECK-GI-NEXT: mla v5.4s, v16.4s, v8.4s 5504; CHECK-GI-NEXT: mla v20.4s, v3.4s, v26.4s 5505; CHECK-GI-NEXT: sshll2 v3.4s, v25.8h, #0 5506; CHECK-GI-NEXT: mla v6.4s, v30.4s, v10.4s 5507; CHECK-GI-NEXT: mla v7.4s, v15.4s, v11.4s 5508; CHECK-GI-NEXT: sshll v25.8h, v23.8b, #0 5509; CHECK-GI-NEXT: mla v4.4s, v18.4s, v9.4s 5510; CHECK-GI-NEXT: ushll v30.8h, v22.8b, #0 5511; CHECK-GI-NEXT: ushll2 v26.8h, v22.16b, #0 5512; CHECK-GI-NEXT: sshll v22.8h, v21.8b, #0 5513; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill 5514; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload 5515; CHECK-GI-NEXT: ushll2 v8.8h, v27.16b, #0 5516; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill 5517; CHECK-GI-NEXT: ldr q9, [sp, #32] // 16-byte Folded Reload 5518; CHECK-GI-NEXT: ushll2 v1.4s, v14.8h, #0 5519; CHECK-GI-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill 5520; CHECK-GI-NEXT: mla v19.4s, v3.4s, v29.4s 5521; CHECK-GI-NEXT: sshll2 v7.4s, v25.8h, #0 5522; CHECK-GI-NEXT: str q5, [sp, #176] // 16-byte Folded Spill 5523; CHECK-GI-NEXT: ushll v29.8h, v24.8b, #0 5524; CHECK-GI-NEXT: ushll2 v27.8h, v24.16b, #0 5525; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill 5526; CHECK-GI-NEXT: ldp q0, q16, [sp, #96] // 32-byte Folded Reload 5527; CHECK-GI-NEXT: str q4, [sp, #144] // 16-byte Folded Spill 5528; CHECK-GI-NEXT: sshll2 v24.8h, v23.16b, #0 5529; 
CHECK-GI-NEXT: ushll2 v18.4s, v26.8h, #0 5530; CHECK-GI-NEXT: stp q19, q20, [sp, #192] // 32-byte Folded Spill 5531; CHECK-GI-NEXT: sshll2 v20.8h, v21.16b, #0 5532; CHECK-GI-NEXT: sshll v21.8h, v17.8b, #0 5533; CHECK-GI-NEXT: sshll2 v19.8h, v17.16b, #0 5534; CHECK-GI-NEXT: sshll2 v17.8h, v0.16b, #0 5535; CHECK-GI-NEXT: mul v16.4s, v16.4s, v9.4s 5536; CHECK-GI-NEXT: ldr q9, [sp, #16] // 16-byte Folded Reload 5537; CHECK-GI-NEXT: sshll v23.8h, v0.8b, #0 5538; CHECK-GI-NEXT: sshll2 v2.4s, v22.8h, #0 5539; CHECK-GI-NEXT: ushll2 v12.4s, v27.8h, #0 5540; CHECK-GI-NEXT: ushll v26.4s, v26.4h, #0 5541; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 5542; CHECK-GI-NEXT: sshll2 v0.4s, v17.8h, #0 5543; CHECK-GI-NEXT: mul v7.4s, v9.4s, v7.4s 5544; CHECK-GI-NEXT: ldr q9, [sp] // 16-byte Folded Reload 5545; CHECK-GI-NEXT: sshll2 v5.4s, v19.8h, #0 5546; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 5547; CHECK-GI-NEXT: sshll2 v3.4s, v20.8h, #0 5548; CHECK-GI-NEXT: mul v2.4s, v9.4s, v2.4s 5549; CHECK-GI-NEXT: ldr q9, [sp, #128] // 16-byte Folded Reload 5550; CHECK-GI-NEXT: ushll2 v15.4s, v8.8h, #0 5551; CHECK-GI-NEXT: mul v0.4s, v18.4s, v0.4s 5552; CHECK-GI-NEXT: ldr q18, [sp, #160] // 16-byte Folded Reload 5553; CHECK-GI-NEXT: ushll2 v11.4s, v29.8h, #0 5554; CHECK-GI-NEXT: sshll v9.4s, v9.4h, #0 5555; CHECK-GI-NEXT: ushll2 v13.4s, v30.8h, #0 5556; CHECK-GI-NEXT: sshll2 v1.4s, v24.8h, #0 5557; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 5558; CHECK-GI-NEXT: sshll2 v4.4s, v21.8h, #0 5559; CHECK-GI-NEXT: sshll2 v6.4s, v23.8h, #0 5560; CHECK-GI-NEXT: mul v5.4s, v12.4s, v5.4s 5561; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 5562; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 5563; CHECK-GI-NEXT: mla v0.4s, v26.4s, v17.4s 5564; CHECK-GI-NEXT: mul v3.4s, v10.4s, v3.4s 5565; CHECK-GI-NEXT: mul v1.4s, v15.4s, v1.4s 5566; CHECK-GI-NEXT: mla v16.4s, v18.4s, v9.4s 5567; CHECK-GI-NEXT: ldp q18, q17, [sp, #192] // 32-byte Folded Reload 5568; CHECK-GI-NEXT: mul v4.4s, v11.4s, v4.4s 5569; CHECK-GI-NEXT: mul v6.4s, 
v13.4s, v6.4s 5570; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 5571; CHECK-GI-NEXT: ldp d13, d12, [sp, #240] // 16-byte Folded Reload 5572; CHECK-GI-NEXT: sshll v20.4s, v20.4h, #0 5573; CHECK-GI-NEXT: ushll v10.4s, v14.4h, #0 5574; CHECK-GI-NEXT: ldp d15, d14, [sp, #224] // 16-byte Folded Reload 5575; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 5576; CHECK-GI-NEXT: ushll v31.4s, v31.4h, #0 5577; CHECK-GI-NEXT: ushll v29.4s, v29.4h, #0 5578; CHECK-GI-NEXT: ushll v30.4s, v30.4h, #0 5579; CHECK-GI-NEXT: sshll v25.4s, v25.4h, #0 5580; CHECK-GI-NEXT: sshll v24.4s, v24.4h, #0 5581; CHECK-GI-NEXT: sshll v22.4s, v22.4h, #0 5582; CHECK-GI-NEXT: sshll v21.4s, v21.4h, #0 5583; CHECK-GI-NEXT: sshll v23.4s, v23.4h, #0 5584; CHECK-GI-NEXT: mla v5.4s, v27.4s, v19.4s 5585; CHECK-GI-NEXT: ldr q19, [sp, #144] // 16-byte Folded Reload 5586; CHECK-GI-NEXT: add v17.4s, v17.4s, v18.4s 5587; CHECK-GI-NEXT: ldr q18, [sp, #176] // 16-byte Folded Reload 5588; CHECK-GI-NEXT: mla v3.4s, v28.4s, v20.4s 5589; CHECK-GI-NEXT: mla v7.4s, v10.4s, v25.4s 5590; CHECK-GI-NEXT: ldp d11, d10, [sp, #256] // 16-byte Folded Reload 5591; CHECK-GI-NEXT: mla v1.4s, v8.4s, v24.4s 5592; CHECK-GI-NEXT: ldp d9, d8, [sp, #272] // 16-byte Folded Reload 5593; CHECK-GI-NEXT: add v18.4s, v18.4s, v19.4s 5594; CHECK-GI-NEXT: ldp q20, q19, [sp, #64] // 32-byte Folded Reload 5595; CHECK-GI-NEXT: mla v2.4s, v31.4s, v22.4s 5596; CHECK-GI-NEXT: mla v4.4s, v29.4s, v21.4s 5597; CHECK-GI-NEXT: mla v6.4s, v30.4s, v23.4s 5598; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s 5599; CHECK-GI-NEXT: add v19.4s, v19.4s, v20.4s 5600; CHECK-GI-NEXT: ldr q20, [sp, #48] // 16-byte Folded Reload 5601; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s 5602; CHECK-GI-NEXT: add v16.4s, v20.4s, v16.4s 5603; CHECK-GI-NEXT: add v3.4s, v4.4s, v5.4s 5604; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s 5605; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s 5606; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s 5607; CHECK-GI-NEXT: add v5.4s, v19.4s, v16.4s 5608; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s 
5609; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s 5610; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s 5611; CHECK-GI-NEXT: addv s1, v2.4s 5612; CHECK-GI-NEXT: addv s0, v0.4s 5613; CHECK-GI-NEXT: fmov w8, s1 5614; CHECK-GI-NEXT: fmov w9, s0 5615; CHECK-GI-NEXT: add w0, w8, w9 5616; CHECK-GI-NEXT: add sp, sp, #304 5617; CHECK-GI-NEXT: ret 5618entry: 5619 %az = zext <64 x i8> %a to <64 x i32> 5620 %bz = sext <64 x i8> %b to <64 x i32> 5621 %m1 = mul nuw nsw <64 x i32> %az, %bz 5622 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1) 5623 %cz = zext <64 x i8> %c to <64 x i32> 5624 %dz = sext <64 x i8> %d to <64 x i32> 5625 %m2 = mul nuw nsw <64 x i32> %cz, %dz 5626 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2) 5627 %x = add i32 %r1, %r2 5628 ret i32 %x 5629} 5630