; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM

define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

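; USDOT is only available with FEAT_I8MM, so the mixed-sign tests below are
; checked twice: with +i8mm a single usdot is expected, and with plain +sve2
; the extends are expanded to unpacks and the products accumulated with mla.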
define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    ptrue p0.s
; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT:    movprfx z1, z3
; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    ptrue p0.s
; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT:    movprfx z1, z3
; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

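; For i8-to-i64 partial reductions there is no single instruction; the
; expected lowering is a dot product into a zeroed 32-bit accumulator whose
; halves are then unpacked to 64 bits and added to the accumulator pair.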
define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot_8to64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    udot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot_8to64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

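; The mixed-sign i8-to-i64 cases follow the same two-step scheme when +i8mm
; is present. Without it the whole reduction is expanded at 64-bit width,
; which uses enough temporaries that z8 and z9 are spilled and reloaded.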
define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT:    ptrue p0.d
; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT:    uunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT:    uunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT:    uunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT:    uunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT:    uunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT:    sunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT:    sunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT:    sunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT:    movprfx z2, z27
; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT:    movprfx z3, z4
; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

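; There is no sudot instruction; as in @sudot above, the operands are
; expected to be commuted so that usdot can be reused.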
define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT:    ptrue p0.d
; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT:    sunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT:    sunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT:    sunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT:    sunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT:    sunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT:    uunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT:    uunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT:    uunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT:    movprfx z2, z27
; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT:    movprfx z3, z4
; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

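; A partial reduction of a plain extend (no multiply) is still expected to
; become a dot product, using a splat of 1 as the second operand.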
define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-LABEL: udot_no_bin_op:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #1 // =0x1
; CHECK-NEXT:    udot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-LABEL: sdot_no_bin_op:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #1 // =0x1
; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udot_no_bin_op_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.h, #1 // =0x1
; CHECK-NEXT:    udot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdot_no_bin_op_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.h, #1 // =0x1
; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a) {
; CHECK-LABEL: udot_no_bin_op_8to64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z3.b, #1 // =0x1
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    udot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a) {
; CHECK-LABEL: sdot_no_bin_op_8to64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z3.b, #1 // =0x1
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
  ret <vscale x 4 x i64> %partial.reduce
}

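; Negative tests: either the element count or the signedness combination does
; not match any dot-product instruction, so the generic unpack-and-mla
; expansion is expected.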
define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_udot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.h, z1.h, #0xff
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_udot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.s, z1.s, #0xffff
; CHECK-NEXT:    and z2.s, z2.s, #0xffff
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z3.d, z1.s
; CHECK-NEXT:    uunpklo z4.d, z2.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
  %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
  %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_usdot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_sudot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

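; When the two multiplicands are extended from different element types, the
; dot-product lowering does not apply either; the narrower operand is first
; brought to a common width (and/sxtb) and the reduction is expanded.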
define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: udot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: sdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: usdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: sudot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}