; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s

; Instruction-selection tests for the AArch64 NEON udot/sdot intrinsics.
; The same expected assembly is verified for an explicit +dotprod feature and
; for each of the CPUs listed in the invocations above.
;
; NOTE(review): several functions and calls below reference attribute groups
; #0 and #2, but no `attributes #N = { ... }` definition is visible in this
; part of the file -- confirm they are defined further down or drop them.

declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)

; --- Vector forms -----------------------------------------------------------
; The intrinsic is called directly with %a as the accumulator; a single
; udot/sdot instruction followed by ret is expected.

define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}

; --- Vector forms, zero accumulator plus add --------------------------------
; The intrinsic is called with a zeroinitializer accumulator and its result is
; added to %a. The expected output is the same single udot/sdot into v0 as in
; the tests above, with no separate add instruction.

define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}

define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}

; --- Unsigned lane forms ----------------------------------------------------
; %c is bitcast to i32 elements, element 1 is splatted with shufflevector, and
; the splat is bitcast back to bytes before the call. The indexed operand
; v2.4b[1] is expected instead of a separate duplicate instruction.
; In the "lane" variants %c is a 64-bit <8 x i8>, and the expected output
; contains a `kill` annotation widening $d2 to $q2; the "laneq" variants take
; a 128-bit <16 x i8> and have no such annotation.

define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

; --- Unsigned lane forms, zero accumulator plus add -------------------------
; Same lane-splat pattern as the unsigned lane tests, but the call uses a
; zeroinitializer accumulator and its result is added to %a. The expected
; output is still one indexed udot into v0 with no separate add.

define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}

define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_u32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}

; --- Signed lane forms ------------------------------------------------------
; Same lane-splat pattern as the unsigned lane tests, using the sdot
; intrinsics; the indexed sdot form with v2.4b[1] is expected.

define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}

define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}

; --- Signed lane forms, zero accumulator plus add ---------------------------
; Signed counterpart of the unsigned lane/zero-accumulator tests: one indexed
; sdot into v0 is expected, with no separate add.

define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}

define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}

define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_s32_zero:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT: ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}