; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s

define <vscale x 2 x i64> @add_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: add_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = add <vscale x 2 x i64> %a, %b
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = add <vscale x 4 x i32> %a, %b
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @add_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: add_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = add <vscale x 8 x i16> %a, %b
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @add_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = add <vscale x 16 x i8> %a, %b
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @add_i8_zero(<vscale x 16 x i8> %a) {
; CHECK-LABEL: add_i8_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = add <vscale x 16 x i8> %a, zeroinitializer
  ret <vscale x 16 x i8> %res
}

define <vscale x 1 x i32> @add_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b) {
; CHECK-LABEL: add_nxv1i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
entry:
  %c = add <vscale x 1 x i32> %a, %b
  ret <vscale x 1 x i32> %c
}

define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sub_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = sub <vscale x 2 x i64> %a, %b
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sub_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = sub <vscale x 4 x i32> %a, %b
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @sub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sub_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = sub <vscale x 8 x i16> %a, %b
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sub_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = sub <vscale x 16 x i8> %a, %b
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @sub_i8_zero(<vscale x 16 x i8> %a) {
; CHECK-LABEL: sub_i8_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = sub <vscale x 16 x i8> %a, zeroinitializer
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @abs_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: abs_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8> %a, i1 false)
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @abs_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: abs_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.abs.nxv8i16(<vscale x 8 x i16> %a, i1 false)
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @abs_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: abs_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32> %a, i1 false)
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @abs_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: abs_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64> %a, i1 false)
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i16> @abs_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-LABEL: abs_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i16> @llvm.abs.nxv4i16(<vscale x 4 x i16> %a, i1 false)
  ret <vscale x 4 x i16> %res
}

define <vscale x 32 x i8> @abs_nxv32i8(<vscale x 32 x i8> %a) {
; CHECK-LABEL: abs_nxv32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    abs z1.b, p0/m, z1.b
; CHECK-NEXT:    ret
  %res = call <vscale x 32 x i8> @llvm.abs.nxv32i8(<vscale x 32 x i8> %a, i1 false)
  ret <vscale x 32 x i8> %res
}

define <vscale x 8 x i64> @abs_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-LABEL: abs_nxv8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    abs z1.d, p0/m, z1.d
; CHECK-NEXT:    abs z2.d, p0/m, z2.d
; CHECK-NEXT:    abs z3.d, p0/m, z3.d
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> %a, i1 false)
  ret <vscale x 8 x i64> %res
}

define <vscale x 2 x i64> @sqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sqadd_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqadd z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @sqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sqadd_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqadd z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @sqadd_i32_zero(<vscale x 4 x i32> %a) {
; CHECK-LABEL: sqadd_i32_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer)
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @sqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sqadd_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqadd z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @sqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sqadd_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqadd z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i8> %res
}


define <vscale x 2 x i64> @sqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: sqsub_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqsub z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @sqsub_i64_zero(<vscale x 2 x i64> %a) {
; CHECK-LABEL: sqsub_i64_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer)
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @sqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: sqsub_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqsub z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @sqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sqsub_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqsub z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @sqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sqsub_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sqsub z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i8> %res
}


define <vscale x 2 x i64> @uqadd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: uqadd_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqadd z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @uqadd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uqadd_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqadd z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @uqadd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uqadd_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqadd z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @uqadd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uqadd_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqadd z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i8> %res
}


define <vscale x 2 x i64> @uqsub_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: uqsub_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqsub z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @uqsub_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: uqsub_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqsub z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i16> @uqsub_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uqsub_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqsub z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  ret <vscale x 8 x i16> %res
}

define <vscale x 16 x i8> @uqsub_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uqsub_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uqsub z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @mad_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mad_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
  %prod = mul <vscale x 16 x i8> %a, %b
  %res = add <vscale x 16 x i8> %c, %prod
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @mad_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mad_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
  %prod = mul <vscale x 8 x i16> %a, %b
  %res = add <vscale x 8 x i16> %c, %prod
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @mad_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mad_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
  %prod = mul <vscale x 4 x i32> %a, %b
  %res = add <vscale x 4 x i32> %c, %prod
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @mad_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mad_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
  %prod = mul <vscale x 2 x i64> %a, %b
  %res = add <vscale x 2 x i64> %c, %prod
  ret <vscale x 2 x i64> %res
}

define <vscale x 16 x i8> @mla_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mla_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mla z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
  %prod = mul <vscale x 16 x i8> %b, %c
  %res = add <vscale x 16 x i8> %a, %prod
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @mla_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mla_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mla z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
  %prod = mul <vscale x 8 x i16> %b, %c
  %res = add <vscale x 8 x i16> %a, %prod
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @mla_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mla_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
  %prod = mul <vscale x 4 x i32> %b, %c
  %res = add <vscale x 4 x i32> %a, %prod
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @mla_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mla_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
  %prod = mul <vscale x 2 x i64> %b, %c
  %res = add <vscale x 2 x i64> %a, %prod
  ret <vscale x 2 x i64> %res
}

define <vscale x 16 x i8> @mla_i8_multiuse(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, ptr %p) {
; CHECK-LABEL: mla_i8_multiuse:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    add z0.b, z2.b, z1.b
; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
; CHECK-NEXT:    ret
  %prod = mul <vscale x 16 x i8> %a, %b
  store <vscale x 16 x i8> %prod, ptr %p
  %res = add <vscale x 16 x i8> %c, %prod
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @msb_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: msb_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    msb z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
  %prod = mul <vscale x 16 x i8> %a, %b
  %res = sub <vscale x 16 x i8> %c, %prod
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @msb_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: msb_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    msb z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
  %prod = mul <vscale x 8 x i16> %a, %b
  %res = sub <vscale x 8 x i16> %c, %prod
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @msb_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: msb_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    msb z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
  %prod = mul <vscale x 4 x i32> %a, %b
  %res = sub <vscale x 4 x i32> %c, %prod
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @msb_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: msb_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    msb z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
  %prod = mul <vscale x 2 x i64> %a, %b
  %res = sub <vscale x 2 x i64> %c, %prod
  ret <vscale x 2 x i64> %res
}

define <vscale x 16 x i8> @mls_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: mls_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mls z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
  %prod = mul <vscale x 16 x i8> %b, %c
  %res = sub <vscale x 16 x i8> %a, %prod
  ret <vscale x 16 x i8> %res
}

define <vscale x 8 x i16> @mls_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
; CHECK-LABEL: mls_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mls z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
  %prod = mul <vscale x 8 x i16> %b, %c
  %res = sub <vscale x 8 x i16> %a, %prod
  ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @mls_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: mls_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mls z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
  %prod = mul <vscale x 4 x i32> %b, %c
  %res = sub <vscale x 4 x i32> %a, %prod
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @mls_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
; CHECK-LABEL: mls_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mls z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
  %prod = mul <vscale x 2 x i64> %b, %c
  %res = sub <vscale x 2 x i64> %a, %prod
  ret <vscale x 2 x i64> %res
}

; The test cases below have one of the add/sub operands as a constant splat.

define <vscale x 2 x i64> @muladd_i64_positiveAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: muladd_i64_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.d, #0xffffffff
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 2 x i64> %a, %b
  %2 = add <vscale x 2 x i64> %1, splat (i64 4294967295)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x i64> @muladd_i64_negativeAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: muladd_i64_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.d, #0xffffffff00000001
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 2 x i64> %a, %b
  %2 = add <vscale x 2 x i64> %1, splat (i64 -4294967295)
  ret <vscale x 2 x i64> %2
}


define <vscale x 4 x i32> @muladd_i32_positiveAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: muladd_i32_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.s, #0x10000
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 4 x i32> %a, %b
  %2 = add <vscale x 4 x i32> %1, splat (i32 65536)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x i32> @muladd_i32_negativeAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: muladd_i32_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.s, #0xffff0000
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 4 x i32> %a, %b
  %2 = add <vscale x 4 x i32> %1, splat (i32 -65536)
  ret <vscale x 4 x i32> %2
}

define <vscale x 8 x i16> @muladd_i16_positiveAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: muladd_i16_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.h, #255 // =0xff
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 8 x i16> %a, %b
  %2 = add <vscale x 8 x i16> %1, splat (i16 255)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x i16> @muladd_i16_negativeAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: muladd_i16_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.h, #-255 // =0xffffffffffffff01
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 8 x i16> %a, %b
  %2 = add <vscale x 8 x i16> %1, splat (i16 -255)
  ret <vscale x 8 x i16> %2
}

define <vscale x 16 x i8> @muladd_i8_positiveAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: muladd_i8_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #15 // =0xf
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 16 x i8> %a, %b
  %2 = add <vscale x 16 x i8> %1, splat (i8 15)
  ret <vscale x 16 x i8> %2
}

define <vscale x 16 x i8> @muladd_i8_negativeAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: muladd_i8_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #-15 // =0xfffffffffffffff1
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mad z0.b, p0/m, z1.b, z2.b
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 16 x i8> %a, %b
  %2 = add <vscale x 16 x i8> %1, splat (i8 -15)
  ret <vscale x 16 x i8> %2
}

define <vscale x 2 x i64> @mulsub_i64_positiveAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: mulsub_i64_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    mov z1.d, #0xffffffff
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 2 x i64> %a, %b
  %2 = sub <vscale x 2 x i64> %1, splat (i64 4294967295)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x i64> @mulsub_i64_negativeAddend(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
; CHECK-LABEL: mulsub_i64_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    mov z1.d, #0xffffffff00000001
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 2 x i64> %a, %b
  %2 = sub <vscale x 2 x i64> %1, splat (i64 -4294967295)
  ret <vscale x 2 x i64> %2
}


define <vscale x 4 x i32> @mulsub_i32_positiveAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: mulsub_i32_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    mov z1.s, #0x10000
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 4 x i32> %a, %b
  %2 = sub <vscale x 4 x i32> %1, splat (i32 65536)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x i32> @mulsub_i32_negativeAddend(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
; CHECK-LABEL: mulsub_i32_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    mov z1.s, #0xffff0000
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 4 x i32> %a, %b
  %2 = sub <vscale x 4 x i32> %1, splat (i32 -65536)
  ret <vscale x 4 x i32> %2
}

define <vscale x 8 x i16> @mulsub_i16_positiveAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: mulsub_i16_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    sub z0.h, z0.h, #255 // =0xff
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 8 x i16> %a, %b
  %2 = sub <vscale x 8 x i16> %1, splat (i16 255)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x i16> @mulsub_i16_negativeAddend(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: mulsub_i16_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    mov z1.h, #-255 // =0xffffffffffffff01
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 8 x i16> %a, %b
  %2 = sub <vscale x 8 x i16> %1, splat (i16 -255)
  ret <vscale x 8 x i16> %2
}

define <vscale x 16 x i8> @mulsub_i8_positiveAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: mulsub_i8_positiveAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    sub z0.b, z0.b, #15 // =0xf
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 16 x i8> %a, %b
  %2 = sub <vscale x 16 x i8> %1, splat (i8 15)
  ret <vscale x 16 x i8> %2
}

define <vscale x 16 x i8> @mulsub_i8_negativeAddend(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
; CHECK-LABEL: mulsub_i8_negativeAddend:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    sub z0.b, z0.b, #241 // =0xf1
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 16 x i8> %a, %b
  %2 = sub <vscale x 16 x i8> %1, splat (i8 -15)
  ret <vscale x 16 x i8> %2
}

; TOFIX: Should generate msb for the mul+sub in this case. Swapping the operands of the sub generates the required msb instruction.
define <vscale x 8 x i16> @multiple_fused_ops(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
; CHECK-LABEL: multiple_fused_ops:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #200 // =0xc8
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z2.h, w8
; CHECK-NEXT:    mla z2.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
{
  %1 = mul <vscale x 8 x i16> %a, %b
  %2 = add <vscale x 8 x i16> %1, splat (i16 200)
  %3 = mul <vscale x 8 x i16> %2, %a
  %4 = sub <vscale x 8 x i16> %3, %b
  ret <vscale x 8 x i16> %4
}

define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
; CHECK-LABEL: mad_in_loop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    cmp w3, #1
; CHECK-NEXT:    b.lt .LBB70_3
; CHECK-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-NEXT:    mov w9, w3
; CHECK-NEXT:    mov z0.s, #1 // =0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    whilelo p1.s, xzr, x9
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    cntw x10
; CHECK-NEXT:  .LBB70_2: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
; CHECK-NEXT:    ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT:    mad z1.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    whilelo p1.s, x8, x9
; CHECK-NEXT:    b.mi .LBB70_2
; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
  %0 = tail call i64 @llvm.vscale.i64()
  %1 = shl nuw nsw i64 %0, 2
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body.preheader
  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
  %2 = getelementptr inbounds i32, ptr %src1, i64 %index
  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %2, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
  %3 = getelementptr inbounds i32, ptr %src2, i64 %index
  %wide.masked.load12 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
  %4 = mul nsw <vscale x 4 x i32> %wide.masked.load12, %wide.masked.load
  %5 = add nsw <vscale x 4 x i32> %4, splat (i32 1)
  %6 = getelementptr inbounds i32, ptr %dst, i64 %index
  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
  %index.next = add i64 %index, %1
  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
  br i1 %7, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare i64 @llvm.vscale.i64()
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)

declare <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

declare <vscale x 32 x i8> @llvm.abs.nxv32i8(<vscale x 32 x i8>, i1)
declare <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8>, i1)
declare <vscale x 4 x i16> @llvm.abs.nxv4i16(<vscale x 4 x i16>, i1)
declare <vscale x 8 x i16> @llvm.abs.nxv8i16(<vscale x 8 x i16>, i1)
declare <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32>, i1)
declare <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64>, i1)
declare <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64>, i1)