1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AArch64. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64LegalizerInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 19 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 22 #include "llvm/CodeGen/GlobalISel/Utils.h" 23 #include "llvm/CodeGen/MachineInstr.h" 24 #include "llvm/CodeGen/MachineRegisterInfo.h" 25 #include "llvm/CodeGen/TargetOpcodes.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/Intrinsics.h" 28 #include "llvm/IR/IntrinsicsAArch64.h" 29 #include "llvm/IR/Type.h" 30 #include "llvm/Support/MathExtras.h" 31 #include <initializer_list> 32 33 #define DEBUG_TYPE "aarch64-legalinfo" 34 35 using namespace llvm; 36 using namespace LegalizeActions; 37 using namespace LegalizeMutations; 38 using namespace LegalityPredicates; 39 using namespace MIPatternMatch; 40 41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) 42 : ST(&ST) { 43 using namespace TargetOpcode; 44 const LLT p0 = LLT::pointer(0, 64); 45 const LLT s8 = LLT::scalar(8); 46 const LLT s16 = LLT::scalar(16); 47 const LLT s32 = LLT::scalar(32); 48 const LLT s64 = LLT::scalar(64); 49 const LLT s128 = LLT::scalar(128); 50 const LLT v16s8 = LLT::fixed_vector(16, 8); 51 const LLT v8s8 = LLT::fixed_vector(8, 8); 52 const LLT v4s8 = LLT::fixed_vector(4, 8); 53 const LLT v2s8 = LLT::fixed_vector(2, 8); 54 const LLT v8s16 = LLT::fixed_vector(8, 16); 55 const LLT v4s16 = LLT::fixed_vector(4, 16); 56 const LLT v2s16 = LLT::fixed_vector(2, 16); 57 const LLT v2s32 = LLT::fixed_vector(2, 32); 58 const LLT v4s32 = LLT::fixed_vector(4, 32); 59 const LLT v2s64 = LLT::fixed_vector(2, 64); 60 const LLT v2p0 = LLT::fixed_vector(2, p0); 61 62 const LLT nxv16s8 = LLT::scalable_vector(16, s8); 63 const LLT nxv8s16 = LLT::scalable_vector(8, s16); 64 const LLT nxv4s32 = LLT::scalable_vector(4, s32); 65 const LLT nxv2s64 = LLT::scalable_vector(2, s64); 66 67 std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */ 68 v16s8, v8s16, v4s32, 69 v2s64, v2p0, 70 /* End 128bit types */ 71 /* Begin 64bit types */ 72 v8s8, v4s16, v2s32}; 73 std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0}; 74 SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList); 75 SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList); 76 77 const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); 78 79 // FIXME: support subtargets which have neon/fp-armv8 disabled. 80 if (!ST.hasNEON() || !ST.hasFPARMv8()) { 81 getLegacyLegalizerInfo().computeTables(); 82 return; 83 } 84 85 // Some instructions only support s16 if the subtarget has full 16-bit FP 86 // support. 87 const bool HasFP16 = ST.hasFullFP16(); 88 const LLT &MinFPScalar = HasFP16 ? 
s16 : s32; 89 90 const bool HasCSSC = ST.hasCSSC(); 91 const bool HasRCPC3 = ST.hasRCPC3(); 92 const bool HasSVE = ST.hasSVE(); 93 94 getActionDefinitionsBuilder( 95 {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER}) 96 .legalFor({p0, s8, s16, s32, s64}) 97 .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8, 98 v2s16, v2s8}) 99 .widenScalarToNextPow2(0) 100 .clampScalar(0, s8, s64) 101 .moreElementsToNextPow2(0) 102 .widenVectorEltsToVectorMinSize(0, 64) 103 .clampNumElements(0, v8s8, v16s8) 104 .clampNumElements(0, v4s16, v8s16) 105 .clampNumElements(0, v2s32, v4s32) 106 .clampMaxNumElements(0, s64, 2) 107 .clampMaxNumElements(0, p0, 2) 108 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); 109 110 getActionDefinitionsBuilder(G_PHI) 111 .legalFor({p0, s16, s32, s64}) 112 .legalFor(PackedVectorAllTypeList) 113 .widenScalarToNextPow2(0) 114 .moreElementsToNextPow2(0) 115 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 116 .clampScalar(0, s16, s64) 117 .clampNumElements(0, v8s8, v16s8) 118 .clampNumElements(0, v4s16, v8s16) 119 .clampNumElements(0, v2s32, v4s32) 120 .clampMaxNumElements(0, s64, 2) 121 .clampMaxNumElements(0, p0, 2); 122 123 getActionDefinitionsBuilder(G_BSWAP) 124 .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64}) 125 .widenScalarOrEltToNextPow2(0, 16) 126 .clampScalar(0, s32, s64) 127 .clampNumElements(0, v4s16, v8s16) 128 .clampNumElements(0, v2s32, v4s32) 129 .clampNumElements(0, v2s64, v2s64) 130 .moreElementsToNextPow2(0); 131 132 getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR}) 133 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8}) 134 .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64}) 135 .widenScalarToNextPow2(0) 136 .clampScalar(0, s32, s64) 137 .clampMaxNumElements(0, s8, 16) 138 .clampMaxNumElements(0, s16, 8) 139 .clampNumElements(0, v2s32, v4s32) 140 .clampNumElements(0, v2s64, v2s64) 141 .minScalarOrEltIf( 142 [=](const LegalityQuery &Query) { 143 return Query.Types[0].getNumElements() <= 2; 144 }, 145 0, s32) 146 .minScalarOrEltIf( 147 [=](const LegalityQuery &Query) { 148 return Query.Types[0].getNumElements() <= 4; 149 }, 150 0, s16) 151 .minScalarOrEltIf( 152 [=](const LegalityQuery &Query) { 153 return Query.Types[0].getNumElements() <= 16; 154 }, 155 0, s8) 156 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 157 .moreElementsToNextPow2(0); 158 159 getActionDefinitionsBuilder(G_MUL) 160 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8}) 161 .widenScalarToNextPow2(0) 162 .clampScalar(0, s32, s64) 163 .clampMaxNumElements(0, s8, 16) 164 .clampMaxNumElements(0, s16, 8) 165 .clampNumElements(0, v2s32, v4s32) 166 .clampNumElements(0, v2s64, v2s64) 167 .minScalarOrEltIf( 168 [=](const LegalityQuery &Query) { 169 return Query.Types[0].getNumElements() <= 2; 170 }, 171 0, s32) 172 .minScalarOrEltIf( 173 [=](const LegalityQuery &Query) { 174 return Query.Types[0].getNumElements() <= 4; 175 }, 176 0, s16) 177 .minScalarOrEltIf( 178 [=](const LegalityQuery &Query) { 179 return Query.Types[0].getNumElements() <= 16; 180 }, 181 0, s8) 182 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 183 .moreElementsToNextPow2(0); 184 185 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) 186 .customIf([=](const LegalityQuery &Query) { 187 const auto &SrcTy = Query.Types[0]; 188 const auto &AmtTy = Query.Types[1]; 189 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && 190 AmtTy.getSizeInBits() == 32; 191 }) 192 .legalFor({ 193 {s32, s32}, 194 {s32, s64}, 195 {s64, s64}, 196 {v8s8, v8s8}, 197 {v16s8, 
v16s8}, 198 {v4s16, v4s16}, 199 {v8s16, v8s16}, 200 {v2s32, v2s32}, 201 {v4s32, v4s32}, 202 {v2s64, v2s64}, 203 }) 204 .widenScalarToNextPow2(0) 205 .clampScalar(1, s32, s64) 206 .clampScalar(0, s32, s64) 207 .clampNumElements(0, v8s8, v16s8) 208 .clampNumElements(0, v4s16, v8s16) 209 .clampNumElements(0, v2s32, v4s32) 210 .clampNumElements(0, v2s64, v2s64) 211 .moreElementsToNextPow2(0) 212 .minScalarSameAs(1, 0) 213 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); 214 215 getActionDefinitionsBuilder(G_PTR_ADD) 216 .legalFor({{p0, s64}, {v2p0, v2s64}}) 217 .clampScalarOrElt(1, s64, s64) 218 .clampNumElements(0, v2p0, v2p0); 219 220 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}}); 221 222 getActionDefinitionsBuilder({G_SDIV, G_UDIV}) 223 .legalFor({s32, s64}) 224 .libcallFor({s128}) 225 .clampScalar(0, s32, s64) 226 .widenScalarToNextPow2(0) 227 .scalarize(0); 228 229 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 230 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32}) 231 .libcallFor({s128}) 232 .widenScalarOrEltToNextPow2(0) 233 .minScalarOrElt(0, s32) 234 .clampNumElements(0, v2s32, v4s32) 235 .clampNumElements(0, v2s64, v2s64) 236 .scalarize(0); 237 238 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 239 .widenScalarToNextPow2(0, /*Min = */ 32) 240 .clampScalar(0, s32, s64) 241 .lower(); 242 243 getActionDefinitionsBuilder({G_SMULH, G_UMULH}) 244 .legalFor({s64, v8s16, v16s8, v4s32}) 245 .lower(); 246 247 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 248 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 249 .legalFor(HasCSSC, {s32, s64}) 250 .minScalar(HasCSSC, 0, s32) 251 .clampNumElements(0, v8s8, v16s8) 252 .clampNumElements(0, v4s16, v8s16) 253 .clampNumElements(0, v2s32, v4s32) 254 // FIXME: This sholdn't be needed as v2s64 types are going to 255 // be expanded anyway, but G_ICMP doesn't support splitting vectors yet 256 .clampNumElements(0, v2s64, v2s64) 257 .lower(); 258 259 getActionDefinitionsBuilder( 260 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) 261 .legalFor({{s32, s32}, {s64, s32}}) 262 .clampScalar(0, s32, s64) 263 .clampScalar(1, s32, s64) 264 .widenScalarToNextPow2(0); 265 266 getActionDefinitionsBuilder( 267 {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM, 268 G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT, 269 G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 270 .legalFor({s32, s64, v2s32, v4s32, v2s64}) 271 .legalFor(HasFP16, {s16, v4s16, v8s16}) 272 .libcallFor({s128}) 273 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 274 .minScalarOrElt(0, MinFPScalar) 275 .clampNumElements(0, v4s16, v8s16) 276 .clampNumElements(0, v2s32, v4s32) 277 .clampNumElements(0, v2s64, v2s64) 278 .moreElementsToNextPow2(0); 279 280 getActionDefinitionsBuilder({G_FABS, G_FNEG}) 281 .legalFor({s32, s64, v2s32, v4s32, v2s64}) 282 .legalFor(HasFP16, {s16, v4s16, v8s16}) 283 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 284 .lowerIf(scalarOrEltWiderThan(0, 64)) 285 .clampNumElements(0, v4s16, v8s16) 286 .clampNumElements(0, v2s32, v4s32) 287 .clampNumElements(0, v2s64, v2s64) 288 .moreElementsToNextPow2(0) 289 .lowerFor({s16, v4s16, v8s16}); 290 291 getActionDefinitionsBuilder(G_FREM) 292 .libcallFor({s32, s64, s128}) 293 .minScalar(0, s32) 294 .scalarize(0); 295 296 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT}) 297 .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}}) 298 .libcallFor({{s64, s128}}) 299 .minScalarOrElt(1, MinFPScalar); 300 
301 getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, 302 G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10, 303 G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, 304 G_FSINH, G_FTANH}) 305 // We need a call for these, so we always need to scalarize. 306 .scalarize(0) 307 // Regardless of FP16 support, widen 16-bit elements to 32-bits. 308 .minScalar(0, s32) 309 .libcallFor({s32, s64, s128}); 310 getActionDefinitionsBuilder(G_FPOWI) 311 .scalarize(0) 312 .minScalar(0, s32) 313 .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}}); 314 315 getActionDefinitionsBuilder(G_INSERT) 316 .legalIf(all(typeInSet(0, {s32, s64, p0}), 317 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0))) 318 .widenScalarToNextPow2(0) 319 .clampScalar(0, s32, s64) 320 .widenScalarToNextPow2(1) 321 .minScalar(1, s8) 322 .maxScalarIf(typeInSet(0, {s32}), 1, s16) 323 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32); 324 325 getActionDefinitionsBuilder(G_EXTRACT) 326 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}), 327 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1))) 328 .widenScalarToNextPow2(1) 329 .clampScalar(1, s32, s128) 330 .widenScalarToNextPow2(0) 331 .minScalar(0, s16) 332 .maxScalarIf(typeInSet(1, {s32}), 0, s16) 333 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32) 334 .maxScalarIf(typeInSet(1, {s128}), 0, s64); 335 336 337 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) { 338 auto &Actions = getActionDefinitionsBuilder(Op); 339 340 if (Op == G_SEXTLOAD) 341 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)); 342 343 // Atomics have zero extending behavior. 344 Actions 345 .legalForTypesWithMemDesc({{s32, p0, s8, 8}, 346 {s32, p0, s16, 8}, 347 {s32, p0, s32, 8}, 348 {s64, p0, s8, 2}, 349 {s64, p0, s16, 2}, 350 {s64, p0, s32, 4}, 351 {s64, p0, s64, 8}, 352 {p0, p0, s64, 8}, 353 {v2s32, p0, s64, 8}}) 354 .widenScalarToNextPow2(0) 355 .clampScalar(0, s32, s64) 356 // TODO: We could support sum-of-pow2's but the lowering code doesn't know 357 // how to do that yet. 
358 .unsupportedIfMemSizeNotPow2() 359 // Lower anything left over into G_*EXT and G_LOAD 360 .lower(); 361 } 362 363 auto IsPtrVecPred = [=](const LegalityQuery &Query) { 364 const LLT &ValTy = Query.Types[0]; 365 return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0; 366 }; 367 368 getActionDefinitionsBuilder(G_LOAD) 369 .customIf([=](const LegalityQuery &Query) { 370 return HasRCPC3 && Query.Types[0] == s128 && 371 Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire; 372 }) 373 .customIf([=](const LegalityQuery &Query) { 374 return Query.Types[0] == s128 && 375 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 376 }) 377 .legalForTypesWithMemDesc({{s8, p0, s8, 8}, 378 {s16, p0, s16, 8}, 379 {s32, p0, s32, 8}, 380 {s64, p0, s64, 8}, 381 {p0, p0, s64, 8}, 382 {s128, p0, s128, 8}, 383 {v8s8, p0, s64, 8}, 384 {v16s8, p0, s128, 8}, 385 {v4s16, p0, s64, 8}, 386 {v8s16, p0, s128, 8}, 387 {v2s32, p0, s64, 8}, 388 {v4s32, p0, s128, 8}, 389 {v2s64, p0, s128, 8}}) 390 // These extends are also legal 391 .legalForTypesWithMemDesc( 392 {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}}) 393 .legalForTypesWithMemDesc({ 394 // SVE vscale x 128 bit base sizes 395 {nxv16s8, p0, nxv16s8, 8}, 396 {nxv8s16, p0, nxv8s16, 8}, 397 {nxv4s32, p0, nxv4s32, 8}, 398 {nxv2s64, p0, nxv2s64, 8}, 399 }) 400 .widenScalarToNextPow2(0, /* MinSize = */ 8) 401 .clampMaxNumElements(0, s8, 16) 402 .clampMaxNumElements(0, s16, 8) 403 .clampMaxNumElements(0, s32, 4) 404 .clampMaxNumElements(0, s64, 2) 405 .clampMaxNumElements(0, p0, 2) 406 .lowerIfMemSizeNotByteSizePow2() 407 .clampScalar(0, s8, s64) 408 .narrowScalarIf( 409 [=](const LegalityQuery &Query) { 410 // Clamp extending load results to 32-bits. 411 return Query.Types[0].isScalar() && 412 Query.Types[0] != Query.MMODescrs[0].MemoryTy && 413 Query.Types[0].getSizeInBits() > 32; 414 }, 415 changeTo(0, s32)) 416 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out 417 .bitcastIf(typeInSet(0, {v4s8}), 418 [=](const LegalityQuery &Query) { 419 const LLT VecTy = Query.Types[0]; 420 return std::pair(0, LLT::scalar(VecTy.getSizeInBits())); 421 }) 422 .customIf(IsPtrVecPred) 423 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0) 424 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); 425 426 getActionDefinitionsBuilder(G_STORE) 427 .customIf([=](const LegalityQuery &Query) { 428 return HasRCPC3 && Query.Types[0] == s128 && 429 Query.MMODescrs[0].Ordering == AtomicOrdering::Release; 430 }) 431 .customIf([=](const LegalityQuery &Query) { 432 return Query.Types[0] == s128 && 433 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic; 434 }) 435 .legalForTypesWithMemDesc( 436 {{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16 437 {s32, p0, s8, 8}, // truncstorei8 from s32 438 {s64, p0, s8, 8}, // truncstorei8 from s64 439 {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32 440 {s64, p0, s16, 8}, // truncstorei16 from s64 441 {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8}, 442 {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64 443 {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8}, 444 {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8}, 445 {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}}) 446 .legalForTypesWithMemDesc({ 447 // SVE vscale x 128 bit base sizes 448 // TODO: Add nxv2p0. Consider bitcastIf. 
449 // See #92130 450 // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461 451 {nxv16s8, p0, nxv16s8, 8}, 452 {nxv8s16, p0, nxv8s16, 8}, 453 {nxv4s32, p0, nxv4s32, 8}, 454 {nxv2s64, p0, nxv2s64, 8}, 455 }) 456 .clampScalar(0, s8, s64) 457 .minScalarOrElt(0, s8) 458 .lowerIf([=](const LegalityQuery &Query) { 459 return Query.Types[0].isScalar() && 460 Query.Types[0] != Query.MMODescrs[0].MemoryTy; 461 }) 462 // Maximum: sN * k = 128 463 .clampMaxNumElements(0, s8, 16) 464 .clampMaxNumElements(0, s16, 8) 465 .clampMaxNumElements(0, s32, 4) 466 .clampMaxNumElements(0, s64, 2) 467 .clampMaxNumElements(0, p0, 2) 468 .lowerIfMemSizeNotPow2() 469 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out 470 .bitcastIf(all(typeInSet(0, {v4s8}), 471 LegalityPredicate([=](const LegalityQuery &Query) { 472 return Query.Types[0].getSizeInBits() == 473 Query.MMODescrs[0].MemoryTy.getSizeInBits(); 474 })), 475 [=](const LegalityQuery &Query) { 476 const LLT VecTy = Query.Types[0]; 477 return std::pair(0, LLT::scalar(VecTy.getSizeInBits())); 478 }) 479 .customIf(IsPtrVecPred) 480 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0) 481 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 482 .lower(); 483 484 getActionDefinitionsBuilder(G_INDEXED_STORE) 485 // Idx 0 == Ptr, Idx 1 == Val 486 // TODO: we can implement legalizations but as of now these are 487 // generated in a very specific way. 488 .legalForTypesWithMemDesc({ 489 {p0, s8, s8, 8}, 490 {p0, s16, s16, 8}, 491 {p0, s32, s8, 8}, 492 {p0, s32, s16, 8}, 493 {p0, s32, s32, 8}, 494 {p0, s64, s64, 8}, 495 {p0, p0, p0, 8}, 496 {p0, v8s8, v8s8, 8}, 497 {p0, v16s8, v16s8, 8}, 498 {p0, v4s16, v4s16, 8}, 499 {p0, v8s16, v8s16, 8}, 500 {p0, v2s32, v2s32, 8}, 501 {p0, v4s32, v4s32, 8}, 502 {p0, v2s64, v2s64, 8}, 503 {p0, v2p0, v2p0, 8}, 504 {p0, s128, s128, 8}, 505 }) 506 .unsupported(); 507 508 auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) { 509 LLT LdTy = Query.Types[0]; 510 LLT PtrTy = Query.Types[1]; 511 if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) && 512 !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128) 513 return false; 514 if (PtrTy != p0) 515 return false; 516 return true; 517 }; 518 getActionDefinitionsBuilder(G_INDEXED_LOAD) 519 .unsupportedIf( 520 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) 521 .legalIf(IndexedLoadBasicPred) 522 .unsupported(); 523 getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD}) 524 .unsupportedIf( 525 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) 526 .legalIf(all(typeInSet(0, {s16, s32, s64}), 527 LegalityPredicate([=](const LegalityQuery &Q) { 528 LLT LdTy = Q.Types[0]; 529 LLT PtrTy = Q.Types[1]; 530 LLT MemTy = Q.MMODescrs[0].MemoryTy; 531 if (PtrTy != p0) 532 return false; 533 if (LdTy == s16) 534 return MemTy == s8; 535 if (LdTy == s32) 536 return MemTy == s8 || MemTy == s16; 537 if (LdTy == s64) 538 return MemTy == s8 || MemTy == s16 || MemTy == s32; 539 return false; 540 }))) 541 .unsupported(); 542 543 // Constants 544 getActionDefinitionsBuilder(G_CONSTANT) 545 .legalFor({p0, s8, s16, s32, s64}) 546 .widenScalarToNextPow2(0) 547 .clampScalar(0, s8, s64); 548 getActionDefinitionsBuilder(G_FCONSTANT) 549 .legalFor({s32, s64, s128}) 550 .legalFor(HasFP16, {s16}) 551 .clampScalar(0, MinFPScalar, s128); 552 553 // FIXME: fix moreElementsToNextPow2 554 getActionDefinitionsBuilder(G_ICMP) 555 .legalFor({{s32, s32}, {s32, s64}, {s32, p0}}) 556 .widenScalarOrEltToNextPow2(1) 557 .clampScalar(1, s32, s64) 558 
.clampScalar(0, s32, s32) 559 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) 560 .minScalarEltSameAsIf( 561 [=](const LegalityQuery &Query) { 562 const LLT &Ty = Query.Types[0]; 563 const LLT &SrcTy = Query.Types[1]; 564 return Ty.isVector() && !SrcTy.isPointerVector() && 565 Ty.getElementType() != SrcTy.getElementType(); 566 }, 567 0, 1) 568 .minScalarOrEltIf( 569 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; }, 570 1, s32) 571 .minScalarOrEltIf( 572 [=](const LegalityQuery &Query) { 573 return Query.Types[1].isPointerVector(); 574 }, 575 0, s64) 576 .moreElementsToNextPow2(1) 577 .clampNumElements(1, v8s8, v16s8) 578 .clampNumElements(1, v4s16, v8s16) 579 .clampNumElements(1, v2s32, v4s32) 580 .clampNumElements(1, v2s64, v2s64) 581 .clampNumElements(1, v2p0, v2p0) 582 .customIf(isVector(0)); 583 584 getActionDefinitionsBuilder(G_FCMP) 585 .legalFor({{s32, s32}, 586 {s32, s64}, 587 {v4s32, v4s32}, 588 {v2s32, v2s32}, 589 {v2s64, v2s64}}) 590 .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) 591 .widenScalarOrEltToNextPow2(1) 592 .clampScalar(0, s32, s32) 593 .minScalarOrElt(1, MinFPScalar) 594 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) 595 .minScalarEltSameAsIf( 596 [=](const LegalityQuery &Query) { 597 const LLT &Ty = Query.Types[0]; 598 const LLT &SrcTy = Query.Types[1]; 599 return Ty.isVector() && !SrcTy.isPointerVector() && 600 Ty.getElementType() != SrcTy.getElementType(); 601 }, 602 0, 1) 603 .clampNumElements(1, v4s16, v8s16) 604 .clampNumElements(1, v2s32, v4s32) 605 .clampMaxNumElements(1, s64, 2) 606 .moreElementsToNextPow2(1) 607 .libcallFor({{s32, s128}}); 608 609 // Extensions 610 auto ExtLegalFunc = [=](const LegalityQuery &Query) { 611 unsigned DstSize = Query.Types[0].getSizeInBits(); 612 613 // Handle legal vectors using legalFor 614 if (Query.Types[0].isVector()) 615 return false; 616 617 if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize)) 618 return false; // Extending to a scalar s128 needs narrowing. 619 620 const LLT &SrcTy = Query.Types[1]; 621 622 // Make sure we fit in a register otherwise. Don't bother checking that 623 // the source type is below 128 bits. We shouldn't be allowing anything 624 // through which is wider than the destination in the first place. 625 unsigned SrcSize = SrcTy.getSizeInBits(); 626 if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) 627 return false; 628 629 return true; 630 }; 631 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) 632 .legalIf(ExtLegalFunc) 633 .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}}) 634 .clampScalar(0, s64, s64) // Just for s128, others are handled above. 
635 .moreElementsToNextPow2(0) 636 .clampMaxNumElements(1, s8, 8) 637 .clampMaxNumElements(1, s16, 4) 638 .clampMaxNumElements(1, s32, 2) 639 // Tries to convert a large EXTEND into two smaller EXTENDs 640 .lowerIf([=](const LegalityQuery &Query) { 641 return (Query.Types[0].getScalarSizeInBits() > 642 Query.Types[1].getScalarSizeInBits() * 2) && 643 Query.Types[0].isVector() && 644 (Query.Types[1].getScalarSizeInBits() == 8 || 645 Query.Types[1].getScalarSizeInBits() == 16); 646 }) 647 .clampMinNumElements(1, s8, 8) 648 .clampMinNumElements(1, s16, 4); 649 650 getActionDefinitionsBuilder(G_TRUNC) 651 .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}}) 652 .moreElementsToNextPow2(0) 653 .clampMaxNumElements(0, s8, 8) 654 .clampMaxNumElements(0, s16, 4) 655 .clampMaxNumElements(0, s32, 2) 656 .minScalarOrEltIf( 657 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, 658 0, s8) 659 .lowerIf([=](const LegalityQuery &Query) { 660 LLT DstTy = Query.Types[0]; 661 LLT SrcTy = Query.Types[1]; 662 return DstTy.isVector() && SrcTy.getSizeInBits() > 128 && 663 DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits(); 664 }) 665 .clampMinNumElements(0, s8, 8) 666 .clampMinNumElements(0, s16, 4) 667 .alwaysLegal(); 668 669 getActionDefinitionsBuilder(G_SEXT_INREG) 670 .legalFor({s32, s64}) 671 .legalFor(PackedVectorAllTypeList) 672 .maxScalar(0, s64) 673 .clampNumElements(0, v8s8, v16s8) 674 .clampNumElements(0, v4s16, v8s16) 675 .clampNumElements(0, v2s32, v4s32) 676 .clampMaxNumElements(0, s64, 2) 677 .lower(); 678 679 // FP conversions 680 getActionDefinitionsBuilder(G_FPTRUNC) 681 .legalFor( 682 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) 683 .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}}) 684 .clampNumElements(0, v4s16, v4s16) 685 .clampNumElements(0, v2s32, v2s32) 686 .scalarize(0); 687 688 getActionDefinitionsBuilder(G_FPEXT) 689 .legalFor( 690 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) 691 .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}}) 692 .clampNumElements(0, v4s32, v4s32) 693 .clampNumElements(0, v2s64, v2s64) 694 .scalarize(0); 695 696 // Conversions 697 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 698 .legalFor({{s32, s32}, 699 {s64, s32}, 700 {s32, s64}, 701 {s64, s64}, 702 {v2s64, v2s64}, 703 {v4s32, v4s32}, 704 {v2s32, v2s32}}) 705 .legalFor(HasFP16, 706 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) 707 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 708 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) 709 // The range of a fp16 value fits into an i17, so we can lower the width 710 // to i64. 711 .narrowScalarIf( 712 [=](const LegalityQuery &Query) { 713 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64; 714 }, 715 changeTo(0, s64)) 716 .moreElementsToNextPow2(0) 717 .widenScalarOrEltToNextPow2OrMinSize(0) 718 .minScalar(0, s32) 719 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 
16 : 32) 720 .widenScalarIf( 721 [=](const LegalityQuery &Query) { 722 return Query.Types[0].getScalarSizeInBits() <= 64 && 723 Query.Types[0].getScalarSizeInBits() > 724 Query.Types[1].getScalarSizeInBits(); 725 }, 726 LegalizeMutations::changeElementSizeTo(1, 0)) 727 .widenScalarIf( 728 [=](const LegalityQuery &Query) { 729 return Query.Types[1].getScalarSizeInBits() <= 64 && 730 Query.Types[0].getScalarSizeInBits() < 731 Query.Types[1].getScalarSizeInBits(); 732 }, 733 LegalizeMutations::changeElementSizeTo(0, 1)) 734 .clampNumElements(0, v4s16, v8s16) 735 .clampNumElements(0, v2s32, v4s32) 736 .clampMaxNumElements(0, s64, 2) 737 .libcallFor( 738 {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}}); 739 740 getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) 741 .legalFor({{s32, s32}, 742 {s64, s32}, 743 {s32, s64}, 744 {s64, s64}, 745 {v2s64, v2s64}, 746 {v4s32, v4s32}, 747 {v2s32, v2s32}}) 748 .legalFor(HasFP16, 749 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) 750 // Handle types larger than i64 by scalarizing/lowering. 751 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 752 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) 753 // The range of a fp16 value fits into an i17, so we can lower the width 754 // to i64. 755 .narrowScalarIf( 756 [=](const LegalityQuery &Query) { 757 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64; 758 }, 759 changeTo(0, s64)) 760 .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0) 761 .moreElementsToNextPow2(0) 762 .widenScalarToNextPow2(0, /*MinSize=*/32) 763 .minScalar(0, s32) 764 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32) 765 .widenScalarIf( 766 [=](const LegalityQuery &Query) { 767 unsigned ITySize = Query.Types[0].getScalarSizeInBits(); 768 return (ITySize == 16 || ITySize == 32 || ITySize == 64) && 769 ITySize > Query.Types[1].getScalarSizeInBits(); 770 }, 771 LegalizeMutations::changeElementSizeTo(1, 0)) 772 .widenScalarIf( 773 [=](const LegalityQuery &Query) { 774 unsigned FTySize = Query.Types[1].getScalarSizeInBits(); 775 return (FTySize == 16 || FTySize == 32 || FTySize == 64) && 776 Query.Types[0].getScalarSizeInBits() < FTySize; 777 }, 778 LegalizeMutations::changeElementSizeTo(0, 1)) 779 .widenScalarOrEltToNextPow2(0) 780 .clampNumElements(0, v4s16, v8s16) 781 .clampNumElements(0, v2s32, v4s32) 782 .clampMaxNumElements(0, s64, 2); 783 784 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 785 .legalFor({{s32, s32}, 786 {s64, s32}, 787 {s32, s64}, 788 {s64, s64}, 789 {v2s64, v2s64}, 790 {v4s32, v4s32}, 791 {v2s32, v2s32}}) 792 .legalFor(HasFP16, 793 {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}}) 794 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) 795 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 796 .moreElementsToNextPow2(1) 797 .widenScalarOrEltToNextPow2OrMinSize(1) 798 .minScalar(1, s32) 799 .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 
16 : 32) 800 .widenScalarIf( 801 [=](const LegalityQuery &Query) { 802 return Query.Types[1].getScalarSizeInBits() <= 64 && 803 Query.Types[0].getScalarSizeInBits() < 804 Query.Types[1].getScalarSizeInBits(); 805 }, 806 LegalizeMutations::changeElementSizeTo(0, 1)) 807 .widenScalarIf( 808 [=](const LegalityQuery &Query) { 809 return Query.Types[0].getScalarSizeInBits() <= 64 && 810 Query.Types[0].getScalarSizeInBits() > 811 Query.Types[1].getScalarSizeInBits(); 812 }, 813 LegalizeMutations::changeElementSizeTo(1, 0)) 814 .clampNumElements(0, v4s16, v8s16) 815 .clampNumElements(0, v2s32, v4s32) 816 .clampMaxNumElements(0, s64, 2) 817 .libcallFor({{s16, s128}, 818 {s32, s128}, 819 {s64, s128}, 820 {s128, s128}, 821 {s128, s32}, 822 {s128, s64}}); 823 824 // Control-flow 825 getActionDefinitionsBuilder(G_BRCOND) 826 .legalFor({s32}) 827 .clampScalar(0, s32, s32); 828 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); 829 830 getActionDefinitionsBuilder(G_SELECT) 831 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) 832 .widenScalarToNextPow2(0) 833 .clampScalar(0, s32, s64) 834 .clampScalar(1, s32, s32) 835 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 836 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) 837 .lowerIf(isVector(0)); 838 839 // Pointer-handling 840 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); 841 842 if (TM.getCodeModel() == CodeModel::Small) 843 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom(); 844 else 845 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); 846 847 getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE) 848 .legalIf(all(typeIs(0, p0), typeIs(1, p0))); 849 850 getActionDefinitionsBuilder(G_PTRTOINT) 851 .legalFor({{s64, p0}, {v2s64, v2p0}}) 852 .widenScalarToNextPow2(0, 64) 853 .clampScalar(0, s64, s64) 854 .clampMaxNumElements(0, s64, 2); 855 856 getActionDefinitionsBuilder(G_INTTOPTR) 857 .unsupportedIf([&](const LegalityQuery &Query) { 858 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits(); 859 }) 860 .legalFor({{p0, s64}, {v2p0, v2s64}}) 861 .clampMaxNumElements(1, s64, 2); 862 863 // Casts for 32 and 64-bit width type are just copies. 864 // Same for 128-bit width type, except they are on the FPR bank. 865 getActionDefinitionsBuilder(G_BITCAST) 866 // Keeping 32-bit instructions legal to prevent regression in some tests 867 .legalForCartesianProduct({s32, v2s16, v4s8}) 868 .legalForCartesianProduct({s64, v8s8, v4s16, v2s32}) 869 .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0}) 870 .customIf([=](const LegalityQuery &Query) { 871 // Handle casts from i1 vectors to scalars. 872 LLT DstTy = Query.Types[0]; 873 LLT SrcTy = Query.Types[1]; 874 return DstTy.isScalar() && SrcTy.isVector() && 875 SrcTy.getScalarSizeInBits() == 1; 876 }) 877 .lowerIf([=](const LegalityQuery &Query) { 878 return Query.Types[0].isVector() != Query.Types[1].isVector(); 879 }) 880 .moreElementsToNextPow2(0) 881 .clampNumElements(0, v8s8, v16s8) 882 .clampNumElements(0, v4s16, v8s16) 883 .clampNumElements(0, v2s32, v4s32) 884 .lower(); 885 886 getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); 887 888 // va_list must be a pointer, but most sized types are pretty easy to handle 889 // as the destination. 
890 getActionDefinitionsBuilder(G_VAARG) 891 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0}) 892 .clampScalar(0, s8, s64) 893 .widenScalarToNextPow2(0, /*Min*/ 8); 894 895 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 896 .lowerIf( 897 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); 898 899 bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE(); 900 901 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 902 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) 903 .customFor(!UseOutlineAtomics, {{s128, p0}}) 904 .libcallFor(UseOutlineAtomics, 905 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}}) 906 .clampScalar(0, s32, s64); 907 908 getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, 909 G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, 910 G_ATOMICRMW_XOR}) 911 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) 912 .libcallFor(UseOutlineAtomics, 913 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}}) 914 .clampScalar(0, s32, s64); 915 916 // Do not outline these atomics operations, as per comment in 917 // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR(). 918 getActionDefinitionsBuilder( 919 {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) 920 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) 921 .clampScalar(0, s32, s64); 922 923 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); 924 925 // Merge/Unmerge 926 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 927 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 928 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 929 getActionDefinitionsBuilder(Op) 930 .widenScalarToNextPow2(LitTyIdx, 8) 931 .widenScalarToNextPow2(BigTyIdx, 32) 932 .clampScalar(LitTyIdx, s8, s64) 933 .clampScalar(BigTyIdx, s32, s128) 934 .legalIf([=](const LegalityQuery &Q) { 935 switch (Q.Types[BigTyIdx].getSizeInBits()) { 936 case 32: 937 case 64: 938 case 128: 939 break; 940 default: 941 return false; 942 } 943 switch (Q.Types[LitTyIdx].getSizeInBits()) { 944 case 8: 945 case 16: 946 case 32: 947 case 64: 948 return true; 949 default: 950 return false; 951 } 952 }); 953 } 954 955 // TODO : nxv4s16, nxv2s16, nxv2s32 956 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 957 .legalFor(HasSVE, {{s16, nxv16s8, s64}, 958 {s16, nxv8s16, s64}, 959 {s32, nxv4s32, s64}, 960 {s64, nxv2s64, s64}}) 961 .unsupportedIf([=](const LegalityQuery &Query) { 962 const LLT &EltTy = Query.Types[1].getElementType(); 963 if (Query.Types[1].isScalableVector()) 964 return false; 965 return Query.Types[0] != EltTy; 966 }) 967 .minScalar(2, s64) 968 .customIf([=](const LegalityQuery &Query) { 969 const LLT &VecTy = Query.Types[1]; 970 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || 971 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || 972 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0; 973 }) 974 .minScalarOrEltIf( 975 [=](const LegalityQuery &Query) { 976 // We want to promote to <M x s1> to <M x s64> if that wouldn't 977 // cause the total vec size to be > 128b. 
978 return Query.Types[1].isFixedVector() && 979 Query.Types[1].getNumElements() <= 2; 980 }, 981 0, s64) 982 .minScalarOrEltIf( 983 [=](const LegalityQuery &Query) { 984 return Query.Types[1].isFixedVector() && 985 Query.Types[1].getNumElements() <= 4; 986 }, 987 0, s32) 988 .minScalarOrEltIf( 989 [=](const LegalityQuery &Query) { 990 return Query.Types[1].isFixedVector() && 991 Query.Types[1].getNumElements() <= 8; 992 }, 993 0, s16) 994 .minScalarOrEltIf( 995 [=](const LegalityQuery &Query) { 996 return Query.Types[1].isFixedVector() && 997 Query.Types[1].getNumElements() <= 16; 998 }, 999 0, s8) 1000 .minScalarOrElt(0, s8) // Worst case, we need at least s8. 1001 .moreElementsToNextPow2(1) 1002 .clampMaxNumElements(1, s64, 2) 1003 .clampMaxNumElements(1, s32, 4) 1004 .clampMaxNumElements(1, s16, 8) 1005 .clampMaxNumElements(1, s8, 16) 1006 .clampMaxNumElements(1, p0, 2); 1007 1008 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) 1009 .legalIf( 1010 typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0})) 1011 .legalFor(HasSVE, {{nxv16s8, s32, s64}, 1012 {nxv8s16, s32, s64}, 1013 {nxv4s32, s32, s64}, 1014 {nxv2s64, s64, s64}}) 1015 .moreElementsToNextPow2(0) 1016 .widenVectorEltsToVectorMinSize(0, 64) 1017 .clampNumElements(0, v8s8, v16s8) 1018 .clampNumElements(0, v4s16, v8s16) 1019 .clampNumElements(0, v2s32, v4s32) 1020 .clampMaxNumElements(0, s64, 2) 1021 .clampMaxNumElements(0, p0, 2); 1022 1023 getActionDefinitionsBuilder(G_BUILD_VECTOR) 1024 .legalFor({{v8s8, s8}, 1025 {v16s8, s8}, 1026 {v4s16, s16}, 1027 {v8s16, s16}, 1028 {v2s32, s32}, 1029 {v4s32, s32}, 1030 {v2p0, p0}, 1031 {v2s64, s64}}) 1032 .clampNumElements(0, v4s32, v4s32) 1033 .clampNumElements(0, v2s64, v2s64) 1034 .minScalarOrElt(0, s8) 1035 .widenVectorEltsToVectorMinSize(0, 64) 1036 .widenScalarOrEltToNextPow2(0) 1037 .minScalarSameAs(1, 0); 1038 1039 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower(); 1040 1041 getActionDefinitionsBuilder(G_CTLZ) 1042 .legalForCartesianProduct( 1043 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) 1044 .scalarize(1) 1045 .widenScalarToNextPow2(1, /*Min=*/32) 1046 .clampScalar(1, s32, s64) 1047 .scalarSameSizeAs(0, 1); 1048 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower(); 1049 1050 // TODO: Custom lowering for v2s32, v4s32, v2s64. 1051 getActionDefinitionsBuilder(G_BITREVERSE) 1052 .legalFor({s32, s64, v8s8, v16s8}) 1053 .widenScalarToNextPow2(0, /*Min = */ 32) 1054 .clampScalar(0, s32, s64) 1055 .lower(); 1056 1057 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower(); 1058 1059 getActionDefinitionsBuilder(G_CTTZ) 1060 .lowerIf(isVector(0)) 1061 .widenScalarToNextPow2(1, /*Min=*/32) 1062 .clampScalar(1, s32, s64) 1063 .scalarSameSizeAs(0, 1) 1064 .legalFor(HasCSSC, {s32, s64}) 1065 .customFor(!HasCSSC, {s32, s64}); 1066 1067 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1068 .legalIf([=](const LegalityQuery &Query) { 1069 const LLT &DstTy = Query.Types[0]; 1070 const LLT &SrcTy = Query.Types[1]; 1071 // For now just support the TBL2 variant which needs the source vectors 1072 // to be the same size as the dest. 1073 if (DstTy != SrcTy) 1074 return false; 1075 return llvm::is_contained( 1076 {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy); 1077 }) 1078 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar 1079 // destinations, we just want those lowered into G_BUILD_VECTOR or 1080 // G_EXTRACT_ELEMENT. 
1081 .lowerIf([=](const LegalityQuery &Query) { 1082 return !Query.Types[0].isVector() || !Query.Types[1].isVector(); 1083 }) 1084 .moreElementsIf( 1085 [](const LegalityQuery &Query) { 1086 return Query.Types[0].isVector() && Query.Types[1].isVector() && 1087 Query.Types[0].getNumElements() > 1088 Query.Types[1].getNumElements(); 1089 }, 1090 changeTo(1, 0)) 1091 .moreElementsToNextPow2(0) 1092 .moreElementsIf( 1093 [](const LegalityQuery &Query) { 1094 return Query.Types[0].isVector() && Query.Types[1].isVector() && 1095 Query.Types[0].getNumElements() < 1096 Query.Types[1].getNumElements(); 1097 }, 1098 changeTo(0, 1)) 1099 .widenScalarOrEltToNextPow2OrMinSize(0, 8) 1100 .clampNumElements(0, v8s8, v16s8) 1101 .clampNumElements(0, v4s16, v8s16) 1102 .clampNumElements(0, v4s32, v4s32) 1103 .clampNumElements(0, v2s64, v2s64) 1104 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 1105 .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) { 1106 // Bitcast pointers vector to i64. 1107 const LLT DstTy = Query.Types[0]; 1108 return std::pair(0, LLT::vector(DstTy.getElementCount(), 64)); 1109 }); 1110 1111 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1112 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}}) 1113 .bitcastIf( 1114 [=](const LegalityQuery &Query) { 1115 return Query.Types[0].getSizeInBits() <= 128 && 1116 Query.Types[1].getSizeInBits() <= 64; 1117 }, 1118 [=](const LegalityQuery &Query) { 1119 const LLT DstTy = Query.Types[0]; 1120 const LLT SrcTy = Query.Types[1]; 1121 return std::pair( 1122 0, DstTy.changeElementSize(SrcTy.getSizeInBits()) 1123 .changeElementCount( 1124 DstTy.getElementCount().divideCoefficientBy( 1125 SrcTy.getNumElements()))); 1126 }); 1127 1128 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0}); 1129 1130 getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}}); 1131 1132 getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom(); 1133 1134 getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower(); 1135 1136 if (ST.hasMOPS()) { 1137 // G_BZERO is not supported. Currently it is only emitted by 1138 // PreLegalizerCombiner for G_MEMSET with zero constant. 1139 getActionDefinitionsBuilder(G_BZERO).unsupported(); 1140 1141 getActionDefinitionsBuilder(G_MEMSET) 1142 .legalForCartesianProduct({p0}, {s64}, {s64}) 1143 .customForCartesianProduct({p0}, {s8}, {s64}) 1144 .immIdx(0); // Inform verifier imm idx 0 is handled. 1145 1146 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) 1147 .legalForCartesianProduct({p0}, {p0}, {s64}) 1148 .immIdx(0); // Inform verifier imm idx 0 is handled. 1149 1150 // G_MEMCPY_INLINE does not have a tailcall immediate 1151 getActionDefinitionsBuilder(G_MEMCPY_INLINE) 1152 .legalForCartesianProduct({p0}, {p0}, {s64}); 1153 1154 } else { 1155 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) 1156 .libcall(); 1157 } 1158 1159 // FIXME: Legal vector types are only legal with NEON. 1160 getActionDefinitionsBuilder(G_ABS) 1161 .legalFor(HasCSSC, {s32, s64}) 1162 .legalFor(PackedVectorAllTypeList) 1163 .customIf([=](const LegalityQuery &Q) { 1164 // TODO: Fix suboptimal codegen for 128+ bit types. 
1165 LLT SrcTy = Q.Types[0]; 1166 return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128; 1167 }) 1168 .widenScalarIf( 1169 [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; }, 1170 [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); }) 1171 .widenScalarIf( 1172 [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; }, 1173 [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); }) 1174 .clampNumElements(0, v8s8, v16s8) 1175 .clampNumElements(0, v4s16, v8s16) 1176 .clampNumElements(0, v2s32, v4s32) 1177 .clampNumElements(0, v2s64, v2s64) 1178 .moreElementsToNextPow2(0) 1179 .lower(); 1180 1181 // For fadd reductions we have pairwise operations available. We treat the 1182 // usual legal types as legal and handle the lowering to pairwise instructions 1183 // later. 1184 getActionDefinitionsBuilder(G_VECREDUCE_FADD) 1185 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}}) 1186 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) 1187 .minScalarOrElt(0, MinFPScalar) 1188 .clampMaxNumElements(1, s64, 2) 1189 .clampMaxNumElements(1, s32, 4) 1190 .clampMaxNumElements(1, s16, 8) 1191 .lower(); 1192 1193 // For fmul reductions we need to split up into individual operations. We 1194 // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of 1195 // smaller types, followed by scalarizing what remains. 1196 getActionDefinitionsBuilder(G_VECREDUCE_FMUL) 1197 .minScalarOrElt(0, MinFPScalar) 1198 .clampMaxNumElements(1, s64, 2) 1199 .clampMaxNumElements(1, s32, 4) 1200 .clampMaxNumElements(1, s16, 8) 1201 .clampMaxNumElements(1, s32, 2) 1202 .clampMaxNumElements(1, s16, 4) 1203 .scalarize(1) 1204 .lower(); 1205 1206 getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL}) 1207 .scalarize(2) 1208 .lower(); 1209 1210 getActionDefinitionsBuilder(G_VECREDUCE_ADD) 1211 .legalFor({{s8, v16s8}, 1212 {s8, v8s8}, 1213 {s16, v8s16}, 1214 {s16, v4s16}, 1215 {s32, v4s32}, 1216 {s32, v2s32}, 1217 {s64, v2s64}}) 1218 .clampMaxNumElements(1, s64, 2) 1219 .clampMaxNumElements(1, s32, 4) 1220 .clampMaxNumElements(1, s16, 8) 1221 .clampMaxNumElements(1, s8, 16) 1222 .lower(); 1223 1224 getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX, 1225 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM}) 1226 .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) 1227 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) 1228 .minScalarOrElt(0, MinFPScalar) 1229 .clampMaxNumElements(1, s64, 2) 1230 .clampMaxNumElements(1, s32, 4) 1231 .clampMaxNumElements(1, s16, 8) 1232 .lower(); 1233 1234 getActionDefinitionsBuilder(G_VECREDUCE_MUL) 1235 .clampMaxNumElements(1, s32, 2) 1236 .clampMaxNumElements(1, s16, 4) 1237 .clampMaxNumElements(1, s8, 8) 1238 .scalarize(1) 1239 .lower(); 1240 1241 getActionDefinitionsBuilder( 1242 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX}) 1243 .legalFor({{s8, v8s8}, 1244 {s8, v16s8}, 1245 {s16, v4s16}, 1246 {s16, v8s16}, 1247 {s32, v2s32}, 1248 {s32, v4s32}}) 1249 .moreElementsIf( 1250 [=](const LegalityQuery &Query) { 1251 return Query.Types[1].isVector() && 1252 Query.Types[1].getElementType() != s8 && 1253 Query.Types[1].getNumElements() & 1; 1254 }, 1255 LegalizeMutations::moreElementsToNextPow2(1)) 1256 .clampMaxNumElements(1, s64, 2) 1257 .clampMaxNumElements(1, s32, 4) 1258 .clampMaxNumElements(1, s16, 8) 1259 .clampMaxNumElements(1, s8, 16) 1260 .scalarize(1) 1261 .lower(); 1262 1263 getActionDefinitionsBuilder( 1264 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 1265 // Try to break 
down into smaller vectors as long as they're at least 64 1266 // bits. This lets us use vector operations for some parts of the 1267 // reduction. 1268 .fewerElementsIf( 1269 [=](const LegalityQuery &Q) { 1270 LLT SrcTy = Q.Types[1]; 1271 if (SrcTy.isScalar()) 1272 return false; 1273 if (!isPowerOf2_32(SrcTy.getNumElements())) 1274 return false; 1275 // We can usually perform 64b vector operations. 1276 return SrcTy.getSizeInBits() > 64; 1277 }, 1278 [=](const LegalityQuery &Q) { 1279 LLT SrcTy = Q.Types[1]; 1280 return std::make_pair(1, SrcTy.divide(2)); 1281 }) 1282 .scalarize(1) 1283 .lower(); 1284 1285 // TODO: Update this to correct handling when adding AArch64/SVE support. 1286 getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower(); 1287 1288 getActionDefinitionsBuilder({G_FSHL, G_FSHR}) 1289 .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) 1290 .lower(); 1291 1292 getActionDefinitionsBuilder(G_ROTR) 1293 .legalFor({{s32, s64}, {s64, s64}}) 1294 .customIf([=](const LegalityQuery &Q) { 1295 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64; 1296 }) 1297 .lower(); 1298 getActionDefinitionsBuilder(G_ROTL).lower(); 1299 1300 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1301 .customFor({{s32, s32}, {s64, s64}}); 1302 1303 auto always = [=](const LegalityQuery &Q) { return true; }; 1304 getActionDefinitionsBuilder(G_CTPOP) 1305 .legalFor(HasCSSC, {{s32, s32}, {s64, s64}}) 1306 .legalFor({{v8s8, v8s8}, {v16s8, v16s8}}) 1307 .customFor(!HasCSSC, {{s32, s32}, {s64, s64}}) 1308 .customFor({{s128, s128}, 1309 {v2s64, v2s64}, 1310 {v2s32, v2s32}, 1311 {v4s32, v4s32}, 1312 {v4s16, v4s16}, 1313 {v8s16, v8s16}}) 1314 .clampScalar(0, s32, s128) 1315 .widenScalarToNextPow2(0) 1316 .minScalarEltSameAsIf(always, 1, 0) 1317 .maxScalarEltSameAsIf(always, 1, 0); 1318 1319 getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}) 1320 .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8}) 1321 .legalFor(HasSVE, {nxv2s64, nxv4s32, nxv8s16, nxv16s8}) 1322 .clampNumElements(0, v8s8, v16s8) 1323 .clampNumElements(0, v4s16, v8s16) 1324 .clampNumElements(0, v2s32, v4s32) 1325 .clampMaxNumElements(0, s64, 2) 1326 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) 1327 .moreElementsToNextPow2(0) 1328 .lower(); 1329 1330 // TODO: Libcall support for s128. 1331 // TODO: s16 should be legal with full FP16 support. 1332 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 1333 .legalFor({{s64, s32}, {s64, s64}}); 1334 1335 // TODO: Custom legalization for mismatched types. 1336 getActionDefinitionsBuilder(G_FCOPYSIGN) 1337 .moreElementsIf( 1338 [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }, 1339 [=](const LegalityQuery &Query) { 1340 const LLT Ty = Query.Types[0]; 1341 return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty)); 1342 }) 1343 .lower(); 1344 1345 getActionDefinitionsBuilder(G_FMAD).lower(); 1346 1347 // Access to floating-point environment. 1348 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV, 1349 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE}) 1350 .libcall(); 1351 1352 getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); 1353 1354 getActionDefinitionsBuilder(G_PREFETCH).custom(); 1355 1356 getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower(); 1357 1358 getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR) 1359 .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}}) 1360 .widenScalarOrEltToNextPow2(0) 1361 .immIdx(0); // Inform verifier imm idx 0 is handled. 
1362 1363 // TODO: {nxv16s8, s8}, {nxv8s16, s16} 1364 getActionDefinitionsBuilder(G_SPLAT_VECTOR) 1365 .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}}); 1366 1367 getLegacyLegalizerInfo().computeTables(); 1368 verify(*ST.getInstrInfo()); 1369 } 1370 1371 bool AArch64LegalizerInfo::legalizeCustom( 1372 LegalizerHelper &Helper, MachineInstr &MI, 1373 LostDebugLocObserver &LocObserver) const { 1374 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; 1375 MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); 1376 GISelChangeObserver &Observer = Helper.Observer; 1377 switch (MI.getOpcode()) { 1378 default: 1379 // No idea what to do. 1380 return false; 1381 case TargetOpcode::G_VAARG: 1382 return legalizeVaArg(MI, MRI, MIRBuilder); 1383 case TargetOpcode::G_LOAD: 1384 case TargetOpcode::G_STORE: 1385 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer); 1386 case TargetOpcode::G_SHL: 1387 case TargetOpcode::G_ASHR: 1388 case TargetOpcode::G_LSHR: 1389 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); 1390 case TargetOpcode::G_GLOBAL_VALUE: 1391 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); 1392 case TargetOpcode::G_SBFX: 1393 case TargetOpcode::G_UBFX: 1394 return legalizeBitfieldExtract(MI, MRI, Helper); 1395 case TargetOpcode::G_FSHL: 1396 case TargetOpcode::G_FSHR: 1397 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper); 1398 case TargetOpcode::G_ROTR: 1399 return legalizeRotate(MI, MRI, Helper); 1400 case TargetOpcode::G_CTPOP: 1401 return legalizeCTPOP(MI, MRI, Helper); 1402 case TargetOpcode::G_ATOMIC_CMPXCHG: 1403 return legalizeAtomicCmpxchg128(MI, MRI, Helper); 1404 case TargetOpcode::G_CTTZ: 1405 return legalizeCTTZ(MI, Helper); 1406 case TargetOpcode::G_BZERO: 1407 case TargetOpcode::G_MEMCPY: 1408 case TargetOpcode::G_MEMMOVE: 1409 case TargetOpcode::G_MEMSET: 1410 return legalizeMemOps(MI, Helper); 1411 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1412 return legalizeExtractVectorElt(MI, MRI, Helper); 1413 case TargetOpcode::G_DYN_STACKALLOC: 1414 return legalizeDynStackAlloc(MI, Helper); 1415 case TargetOpcode::G_PREFETCH: 1416 return legalizePrefetch(MI, Helper); 1417 case TargetOpcode::G_ABS: 1418 return Helper.lowerAbsToCNeg(MI); 1419 case TargetOpcode::G_ICMP: 1420 return legalizeICMP(MI, MRI, MIRBuilder); 1421 case TargetOpcode::G_BITCAST: 1422 return legalizeBitcast(MI, Helper); 1423 } 1424 1425 llvm_unreachable("expected switch to return"); 1426 } 1427 1428 bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI, 1429 LegalizerHelper &Helper) const { 1430 assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode"); 1431 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); 1432 // We're trying to handle casts from i1 vectors to scalars but reloading from 1433 // stack. 
1434 if (!DstTy.isScalar() || !SrcTy.isVector() || 1435 SrcTy.getElementType() != LLT::scalar(1)) 1436 return false; 1437 1438 Helper.createStackStoreLoad(DstReg, SrcReg); 1439 MI.eraseFromParent(); 1440 return true; 1441 } 1442 1443 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI, 1444 MachineRegisterInfo &MRI, 1445 MachineIRBuilder &MIRBuilder, 1446 GISelChangeObserver &Observer, 1447 LegalizerHelper &Helper) const { 1448 assert(MI.getOpcode() == TargetOpcode::G_FSHL || 1449 MI.getOpcode() == TargetOpcode::G_FSHR); 1450 1451 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic 1452 // lowering 1453 Register ShiftNo = MI.getOperand(3).getReg(); 1454 LLT ShiftTy = MRI.getType(ShiftNo); 1455 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI); 1456 1457 // Adjust shift amount according to Opcode (FSHL/FSHR) 1458 // Convert FSHL to FSHR 1459 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg()); 1460 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false); 1461 1462 // Lower non-constant shifts and leave zero shifts to the optimizer. 1463 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0) 1464 return (Helper.lowerFunnelShiftAsShifts(MI) == 1465 LegalizerHelper::LegalizeResult::Legalized); 1466 1467 APInt Amount = VRegAndVal->Value.urem(BitWidth); 1468 1469 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount; 1470 1471 // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount 1472 // in the range of 0 <-> BitWidth, it is legal 1473 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR && 1474 VRegAndVal->Value.ult(BitWidth)) 1475 return true; 1476 1477 // Cast the ShiftNumber to a 64-bit type 1478 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64)); 1479 1480 if (MI.getOpcode() == TargetOpcode::G_FSHR) { 1481 Observer.changingInstr(MI); 1482 MI.getOperand(3).setReg(Cast64.getReg(0)); 1483 Observer.changedInstr(MI); 1484 } 1485 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR 1486 // instruction 1487 else if (MI.getOpcode() == TargetOpcode::G_FSHL) { 1488 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()}, 1489 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(), 1490 Cast64.getReg(0)}); 1491 MI.eraseFromParent(); 1492 } 1493 return true; 1494 } 1495 1496 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI, 1497 MachineRegisterInfo &MRI, 1498 MachineIRBuilder &MIRBuilder) const { 1499 Register DstReg = MI.getOperand(0).getReg(); 1500 Register SrcReg1 = MI.getOperand(2).getReg(); 1501 Register SrcReg2 = MI.getOperand(3).getReg(); 1502 LLT DstTy = MRI.getType(DstReg); 1503 LLT SrcTy = MRI.getType(SrcReg1); 1504 1505 // Check the vector types are legal 1506 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() || 1507 DstTy.getNumElements() != SrcTy.getNumElements() || 1508 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128)) 1509 return false; 1510 1511 // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for 1512 // following passes 1513 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate(); 1514 if (Pred != CmpInst::ICMP_NE) 1515 return true; 1516 Register CmpReg = 1517 MIRBuilder 1518 .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2) 1519 .getReg(0); 1520 MIRBuilder.buildNot(DstReg, CmpReg); 1521 1522 MI.eraseFromParent(); 1523 return true; 1524 } 1525 1526 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI, 
1527 MachineRegisterInfo &MRI, 1528 LegalizerHelper &Helper) const { 1529 // To allow for imported patterns to match, we ensure that the rotate amount 1530 // is 64b with an extension. 1531 Register AmtReg = MI.getOperand(2).getReg(); 1532 LLT AmtTy = MRI.getType(AmtReg); 1533 (void)AmtTy; 1534 assert(AmtTy.isScalar() && "Expected a scalar rotate"); 1535 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal"); 1536 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); 1537 Helper.Observer.changingInstr(MI); 1538 MI.getOperand(2).setReg(NewAmt.getReg(0)); 1539 Helper.Observer.changedInstr(MI); 1540 return true; 1541 } 1542 1543 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( 1544 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, 1545 GISelChangeObserver &Observer) const { 1546 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); 1547 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + 1548 // G_ADD_LOW instructions. 1549 // By splitting this here, we can optimize accesses in the small code model by 1550 // folding in the G_ADD_LOW into the load/store offset. 1551 auto &GlobalOp = MI.getOperand(1); 1552 // Don't modify an intrinsic call. 1553 if (GlobalOp.isSymbol()) 1554 return true; 1555 const auto* GV = GlobalOp.getGlobal(); 1556 if (GV->isThreadLocal()) 1557 return true; // Don't want to modify TLS vars. 1558 1559 auto &TM = ST->getTargetLowering()->getTargetMachine(); 1560 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM); 1561 1562 if (OpFlags & AArch64II::MO_GOT) 1563 return true; 1564 1565 auto Offset = GlobalOp.getOffset(); 1566 Register DstReg = MI.getOperand(0).getReg(); 1567 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) 1568 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE); 1569 // Set the regclass on the dest reg too. 1570 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); 1571 1572 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so 1573 // by creating a MOVK that sets bits 48-63 of the register to (global address 1574 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to 1575 // prevent an incorrect tag being generated during relocation when the 1576 // global appears before the code section. Without the offset, a global at 1577 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced 1578 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 = 1579 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe` 1580 // instead of `0xf`. 1581 // This assumes that we're in the small code model so we can assume a binary 1582 // size of <= 4GB, which makes the untagged PC relative offset positive. The 1583 // binary must also be loaded into address range [0, 2^48). Both of these 1584 // properties need to be ensured at runtime when using tagged addresses. 
bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  // Don't modify an intrinsic call.
  if (GlobalOp.isSymbol())
    return true;
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
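  // As an illustrative sanity check, reusing the numbers from the example
  // above (not real relocation output): with the bias applied, the MOVK
  // payload is
  //   (0x0f00'0000'0000'1000 + 0x100000000 - 0x2000) >> 48
  //     = 0x0f00'0000'ffff'f000 >> 48 = 0x0f00,
  // so bits 48-63 keep the intended tag `0xf`.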
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  auto LowerBinOp = [&MI](unsigned Opcode) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opcode, {MI.getOperand(0)},
                   {MI.getOperand(2), MI.getOperand(3)});
    MI.eraseFromParent();
    return true;
  };

  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrinsicID) {
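  // Note on the size used below: on Darwin and Windows va_list is a single
  // pointer, while the AAPCS64 va_list is a struct of three pointers and two
  // ints (32 bytes for LP64, 20 bytes for ILP32); this is where the 32/20
  // values come from.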
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }
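  // The immediate constructed below follows the PRFM prfop encoding: bit 4
  // selects store vs. load prefetch, bit 3 instruction vs. data cache,
  // bits 2:1 the target cache level, and bit 0 the streaming policy. As an
  // illustrative example, IsWrite=0, IsData=1, Target=0, IsStream=0 encodes
  // to 0b00000, i.e. PLDL1KEEP.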
  case Intrinsic::aarch64_prefetch: {
    MachineIRBuilder MIB(MI);
    auto &AddrVal = MI.getOperand(1);

    int64_t IsWrite = MI.getOperand(2).getImm();
    int64_t Target = MI.getOperand(3).getImm();
    int64_t IsStream = MI.getOperand(4).getImm();
    int64_t IsData = MI.getOperand(5).getImm();

    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                     (!IsData << 3) |    // IsDataCache bit
                     (Target << 1) |     // Cache level bits
                     (unsigned)IsStream; // Stream bit

    MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
    MI.eraseFromParent();
    return true;
  }
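  // For the across-lane reductions handled below, the intrinsic's scalar
  // result type (often i32) can be wider than the element type the
  // instruction actually produces. As an illustrative sketch, for
  //   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddv), %v:_(<8 x s8>)
  // the def is retyped to s8 and an extension is inserted afterwards:
  //   %t:_(s8)  = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddv), %v
  //   %r:_(s32) = G_ZEXT %t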
  case Intrinsic::aarch64_neon_uaddv:
  case Intrinsic::aarch64_neon_saddv:
  case Intrinsic::aarch64_neon_umaxv:
  case Intrinsic::aarch64_neon_smaxv:
  case Intrinsic::aarch64_neon_uminv:
  case Intrinsic::aarch64_neon_sminv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();
    bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
                    IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
                    IntrinsicID == Intrinsic::aarch64_neon_sminv;

    auto OldDst = MI.getOperand(0).getReg();
    auto OldDstTy = MRI.getType(OldDst);
    LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
    if (OldDstTy == NewDstTy)
      return true;

    auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);

    Helper.Observer.changingInstr(MI);
    MI.getOperand(0).setReg(NewDst);
    Helper.Observer.changedInstr(MI);

    MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
    MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
                        OldDst, NewDst);

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlp:
  case Intrinsic::aarch64_neon_saddlp: {
    MachineIRBuilder MIB(MI);

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
                       ? AArch64::G_UADDLP
                       : AArch64::G_SADDLP;
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlv:
  case Intrinsic::aarch64_neon_saddlv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
                       ? AArch64::G_UADDLV
                       : AArch64::G_SADDLV;
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(2).getReg();
    LLT DstTy = MRI.getType(DstReg);

    LLT MidTy, ExtTy;
    if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
      MidTy = LLT::fixed_vector(4, 32);
      ExtTy = LLT::scalar(32);
    } else {
      MidTy = LLT::fixed_vector(2, 64);
      ExtTy = LLT::scalar(64);
    }

    Register MidReg =
        MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
    Register ZeroReg =
        MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
    Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
                                     {MidReg, ZeroReg})
                          .getReg(0);

    if (DstTy.getScalarSizeInBits() < 32)
      MIB.buildTrunc(DstReg, ExtReg);
    else
      MIB.buildCopy(DstReg, ExtReg);

    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_smax:
    return LowerBinOp(TargetOpcode::G_SMAX);
  case Intrinsic::aarch64_neon_smin:
    return LowerBinOp(TargetOpcode::G_SMIN);
  case Intrinsic::aarch64_neon_umax:
    return LowerBinOp(TargetOpcode::G_UMAX);
  case Intrinsic::aarch64_neon_umin:
    return LowerBinOp(TargetOpcode::G_UMIN);
  case Intrinsic::aarch64_neon_fmax:
    return LowerBinOp(TargetOpcode::G_FMAXIMUM);
  case Intrinsic::aarch64_neon_fmin:
    return LowerBinOp(TargetOpcode::G_FMINIMUM);
  case Intrinsic::aarch64_neon_fmaxnm:
    return LowerBinOp(TargetOpcode::G_FMAXNUM);
  case Intrinsic::aarch64_neon_fminnm:
    return LowerBinOp(TargetOpcode::G_FMINNUM);
  case Intrinsic::aarch64_neon_smull:
    return LowerBinOp(AArch64::G_SMULL);
  case Intrinsic::aarch64_neon_umull:
    return LowerBinOp(AArch64::G_UMULL);
  case Intrinsic::aarch64_neon_abs: {
    // Lower the intrinsic to G_ABS.
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();
    return true;
  }

  case Intrinsic::vector_reverse:
    // TODO: Add support for vector_reverse
    return false;
  }

  return true;
}
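
// An illustrative example of the promotion performed below: for a 32-bit
//   %d:_(s32) = G_ASHR %x:_(s32), %amt:_(s32)
// whose amount is a G_CONSTANT 7, the amount operand is replaced with a
//   %amt64:_(s64) = G_CONSTANT i64 7
// so that the imported SelectionDAG immediate-shift patterns can select it.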
bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
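
// Helper for the s128 load/store lowering below: fold a constant G_PTR_ADD
// into the base/offset when the offset fits LDP/STP. isShiftedInt<7, 3>
// accepts multiples of 8 in [-512, 504], which after the divide-by-8 below
// maps onto the signed imm7 field of LDP/STP.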
static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. To allow further legalization of the instruction, we
  // create a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {

    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
    bool IsRcpC3 =
        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);

    LLT s64 = LLT::scalar(64);

    unsigned Opcode;
    if (IsRcpC3) {
      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
    } else {
      // For LSE2, loads/stores should have been converted to monotonic and had
      // a fence inserted after them.
      assert(Ordering == AtomicOrdering::Monotonic ||
             Ordering == AtomicOrdering::Unordered);
      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");

      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
    }

    MachineInstrBuilder NewI;
    if (IsLoad) {
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      NewI.addImm(Offset / 8);
    }

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isPointerVector() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}
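
// Illustrative outline of the AAPCS va_arg expansion performed below: load
// the current list pointer, realign it when the requested alignment exceeds
// the slot alignment, load the value itself, advance the pointer by the
// value size rounded up to a slot multiple, and store the updated pointer
// back to the list.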
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.

  if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
      Ty.getScalarSizeInBits() != 16) {
    LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
    auto Zeros = MIRBuilder.buildConstant(Dt, 0);
    auto Ones = MIRBuilder.buildConstant(VTy, 1);
    MachineInstrBuilder Sum;

    if (Ty == LLT::fixed_vector(2, 64)) {
      auto UDOT =
          MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
      Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
    } else if (Ty == LLT::fixed_vector(4, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else if (Ty == LLT::fixed_vector(2, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else {
      llvm_unreachable("unexpected vector shape");
    }

    Sum->getOperand(0).setReg(Dst);
    MI.eraseFromParent();
    return true;
  }

  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}
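
// Note: without FEAT_CSSC's CTZ instruction there is no direct
// count-trailing-zeros on AArch64, so the lowering below relies on the
// identity cttz(x) == ctlz(bitreverse(x)), which later selects to RBIT + CLZ.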
bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    auto &Value = MI.getOperand(1);
    Register ExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }

  return false;
}

bool AArch64LegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
  auto VRegAndVal =
      getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
  if (VRegAndVal)
    return true;
  LLT VecTy = MRI.getType(Element->getVectorReg());
  if (VecTy.isScalableVector())
    return true;
  return Helper.lowerExtractInsertVectorElt(MI) !=
         LegalizerHelper::LegalizeResult::UnableToLegalize;
}

bool AArch64LegalizerInfo::legalizeDynStackAlloc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // If stack probing is not enabled for this function, use the default
  // lowering.
  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
          "inline-asm") {
    Helper.lowerDynStackAlloc(MI);
    return true;
  }

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
         "Unexpected type for dynamic alloca");
  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
         "Unexpected type for dynamic alloca");

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg =
      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
  auto NewMI =
      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return true;
}
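
// G_PREFETCH carries the IR-level locality hint 0-3 (3 = keep in all cache
// levels), while the PRFM target-level field counts the other way (0 = L1),
// hence the 3 - Locality below. Illustrative example:
// __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) arrives as IsWrite=0,
// Locality=3, IsData=1 and encodes to PrfOp = 0b00000, i.e. PLDL1KEEP.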
bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
                                            LegalizerHelper &Helper) const {
  MachineIRBuilder &MIB = Helper.MIRBuilder;
  auto &AddrVal = MI.getOperand(0);

  int64_t IsWrite = MI.getOperand(1).getImm();
  int64_t Locality = MI.getOperand(2).getImm();
  int64_t IsData = MI.getOperand(3).getImm();

  bool IsStream = Locality == 0;
  if (Locality != 0) {
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1
    Locality = 3 - Locality;
  }

  unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;

  MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
  MI.eraseFromParent();
  return true;
}