; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM

; Codegen tests for llvm.experimental.vector.partial.reduce.add on AArch64
; NEON. The three llc invocations above cover plain NEON, NEON with +dotprod,
; and NEON with +dotprod and +i8mm.
;
; Every intrinsic call below uses a name suffix that spells out the actual
; accumulator and input vector types of that call (for example .v2i32.v8i32
; for a <2 x i32> accumulator fed by an <8 x i32> input).
;
; NOTE(review): several functions reference attribute group #0, but no
; definition of that group is visible in this view of the file - confirm
; whether the references are intentional before removing or defining it.

; zext * zext, <16 x i8> operands accumulated into <4 x i32>.
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: udot:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    umull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

; zext * zext, <8 x i8> operands accumulated into <2 x i32>.
define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: udot_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

; sext * sext, <16 x i8> operands accumulated into <4 x i32>.
define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: sdot:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    smull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

; sext * sext, <8 x i8> operands accumulated into <2 x i32>.
define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: sdot_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

; zext(%u) * sext(%s), <16 x i8> operands accumulated into <4 x i32>.
define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

; zext(%u) * sext(%s), <8 x i8> operands accumulated into <2 x i32>.
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: usdot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

; sext(%u) * zext(%s), <16 x i8> operands accumulated into <4 x i32>.
define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

; sext(%u) * zext(%s), <8 x i8> operands accumulated into <2 x i32>.
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

; zext * zext, <16 x i8> operands widened to i64 and accumulated into <4 x i64>.
define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-DOT-LABEL: udot_8to64:
; CHECK-DOT:       // %bb.0: // %entry
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_8to64:
; CHECK-NODOT:       // %bb.0: // %entry
; CHECK-NODOT-NEXT:    umull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT:    umull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT:    ushll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT:    uaddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
entry:
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

; sext * sext, <16 x i8> operands widened to i64 and accumulated into <4 x i64>.
define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-DOT-LABEL: sdot_8to64:
; CHECK-DOT:       // %bb.0: // %entry
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_8to64:
; CHECK-NODOT:       // %bb.0: // %entry
; CHECK-NODOT-NEXT:    smull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT:    smull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT:    sshll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT:    saddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
entry:
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

; zext(%a) * sext(%b), <16 x i8> operands widened to i64, accumulated into <4 x i64>.
define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT:    sshll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT:    ret
entry:
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

; sext(%a) * zext(%b), <16 x i8> operands widened to i64, accumulated into <4 x i64>.
define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT:    ushll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT:    ret
entry:
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

; No multiply: a zero-extended <16 x i8> is reduced directly into <4 x i32>.
define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.16b, #1
; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
}

; No multiply: a sign-extended <16 x i8> is reduced directly into <4 x i32>.
define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.16b, #1
; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT:    saddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
}

; No multiply: a zero-extended <8 x i8> is reduced directly into <2 x i32>.
define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.8b, #1
; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
}

; No multiply: a sign-extended <8 x i8> is reduced directly into <2 x i32>.
define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.8b, #1
; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
}

; No multiply: a zero-extended <16 x i8> (to i64) is reduced into <4 x i64>.
define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v3.16b, #1
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT:    ushll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT:    uaddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT:    uaddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
}

; No multiply: a sign-extended <16 x i8> (to i64) is reduced into <4 x i64>.
define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v3.16b, #1
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT:    sshll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT:    saddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT:    saddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
}

; <8 x i32> product reduced into <4 x i32> (2:1 element ratio). The expected
; output is the same for all three run configurations and contains no dot
; instruction.
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-LABEL: not_udot:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> %acc, <8 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

; <4 x i32> product reduced into <2 x i32> (2:1 element ratio). The expected
; output is the same for all three run configurations and contains no dot
; instruction.
define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
; CHECK-LABEL: not_udot_narrow:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic v1.4h, #255, lsl #8
; CHECK-NEXT:    bic v2.4h, #255, lsl #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    umull v3.4s, v2.4h, v1.4h
; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NEXT:    ret
  %u.wide = zext <4 x i8> %u to <4 x i32>
  %s.wide = zext <4 x i8> %s to <4 x i32>
  %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v4i32(<2 x i32> %acc, <4 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

; Operands of different element widths (<8 x i16> and <8 x i8>), both
; zero-extended to i64. The expected output contains no dot instruction.
define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: udot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    umull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    umlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

; Operands of different element widths (<8 x i16> and <8 x i8>), both
; sign-extended to i64. The expected output contains no dot instruction.
define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: sdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

; Operands of different element widths: zext of <8 x i16> times sext of
; <8 x i8>, both to i64. The expected output contains no dot instruction.
define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: usdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

; Operands of different element widths: sext of <8 x i16> times zext of
; <8 x i8>, both to i64. The expected output contains no dot instruction.
define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: sudot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}