//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
"llvm/Support/BranchProbability.h" 92 #include "llvm/Support/Casting.h" 93 #include "llvm/Support/CodeGen.h" 94 #include "llvm/Support/CommandLine.h" 95 #include "llvm/Support/Compiler.h" 96 #include "llvm/Support/Debug.h" 97 #include "llvm/Support/ErrorHandling.h" 98 #include "llvm/Support/KnownBits.h" 99 #include "llvm/Support/MachineValueType.h" 100 #include "llvm/Support/MathExtras.h" 101 #include "llvm/Support/raw_ostream.h" 102 #include "llvm/Target/TargetMachine.h" 103 #include "llvm/Target/TargetOptions.h" 104 #include <algorithm> 105 #include <cassert> 106 #include <cstdint> 107 #include <cstdlib> 108 #include <iterator> 109 #include <limits> 110 #include <string> 111 #include <tuple> 112 #include <utility> 113 #include <vector> 114 115 using namespace llvm; 116 using namespace llvm::PatternMatch; 117 118 #define DEBUG_TYPE "arm-isel" 119 120 STATISTIC(NumTailCalls, "Number of tail calls"); 121 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 122 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 123 STATISTIC(NumConstpoolPromoted, 124 "Number of constants with their storage promoted into constant pools"); 125 126 static cl::opt<bool> 127 ARMInterworking("arm-interworking", cl::Hidden, 128 cl::desc("Enable / disable ARM interworking (for debugging only)"), 129 cl::init(true)); 130 131 static cl::opt<bool> EnableConstpoolPromotion( 132 "arm-promote-constant", cl::Hidden, 133 cl::desc("Enable / disable promotion of unnamed_addr constants into " 134 "constant pools"), 135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 136 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 137 "arm-promote-constant-max-size", cl::Hidden, 138 cl::desc("Maximum size of constant to promote into a constant pool"), 139 cl::init(64)); 140 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 141 "arm-promote-constant-max-total", cl::Hidden, 142 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 143 cl::init(128)); 144 145 // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::v8i8, Legal);
    setIndexedStoreAction(im, MVT::v8i8, Legal);
    setIndexedLoadAction(im, MVT::v4i8, Legal);
    setIndexedStoreAction(im, MVT::v4i8, Legal);
    setIndexedLoadAction(im, MVT::v4i16, Legal);
    setIndexedStoreAction(im, MVT::v4i16, Legal);
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
  }
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
        { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
        { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine(ISD::BRCOND);
    setTargetDAGCombine(ISD::BR_CC);
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have
    // a FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16())
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  }

  if (!Subtarget->hasFP16())
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);

  if (!Subtarget->hasFP64())
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
    for (auto &VT : {MVT::f32, MVT::f64})
      setOperationAction(ISD::FPOWI, VT, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If target has DMB in thumb, Fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4Base()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1322 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1323 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1324 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1325 } 1326 1327 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1328 if (!Subtarget->hasFP16()) { 1329 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1330 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1331 } 1332 } 1333 1334 // Use __sincos_stret if available. 1335 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1336 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1337 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1338 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1339 } 1340 1341 // FP-ARMv8 implements a lot of rounding-like FP operations. 1342 if (Subtarget->hasFPARMv8Base()) { 1343 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1344 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1345 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1346 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1347 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1348 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1349 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1350 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1351 if (Subtarget->hasNEON()) { 1352 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1353 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1354 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1355 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1356 } 1357 1358 if (Subtarget->hasFP64()) { 1359 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1360 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1361 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1362 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1363 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1364 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1365 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1366 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1367 } 1368 } 1369 1370 // FP16 often need to be promoted to call lib functions 1371 if (Subtarget->hasFullFP16()) { 1372 setOperationAction(ISD::FREM, MVT::f16, Promote); 1373 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1374 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1375 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1376 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1377 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1378 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1379 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1380 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1381 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1382 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1383 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1384 1385 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1386 } 1387 1388 if (Subtarget->hasNEON()) { 1389 // vmin and vmax aren't available in a scalar form, so we use 1390 // a NEON instruction with an undef lane instead. 
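// Note that FMINIMUM/FMAXIMUM below are the NaN-propagating operations, which
// match the semantics of NEON vmin/vmax; the IEEE minNum/maxNum forms
// (FMINNUM/FMAXNUM) correspond to vminnm/vmaxnm and were handled in the
// FP-ARMv8 block above.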
1391 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1392 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1393 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1394 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1395 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1396 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1397 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1398 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1399 1400 if (Subtarget->hasFullFP16()) { 1401 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1402 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1403 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1404 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1405 1406 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1407 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1408 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1409 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1410 } 1411 } 1412 1413 // We have target-specific dag combine patterns for the following nodes: 1414 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1415 setTargetDAGCombine(ISD::ADD); 1416 setTargetDAGCombine(ISD::SUB); 1417 setTargetDAGCombine(ISD::MUL); 1418 setTargetDAGCombine(ISD::AND); 1419 setTargetDAGCombine(ISD::OR); 1420 setTargetDAGCombine(ISD::XOR); 1421 1422 if (Subtarget->hasV6Ops()) 1423 setTargetDAGCombine(ISD::SRL); 1424 if (Subtarget->isThumb1Only()) 1425 setTargetDAGCombine(ISD::SHL); 1426 1427 setStackPointerRegisterToSaveRestore(ARM::SP); 1428 1429 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1430 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1431 setSchedulingPreference(Sched::RegPressure); 1432 else 1433 setSchedulingPreference(Sched::Hybrid); 1434 1435 //// temporary - rewrite interface to use type 1436 MaxStoresPerMemset = 8; 1437 MaxStoresPerMemsetOptSize = 4; 1438 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1439 MaxStoresPerMemcpyOptSize = 2; 1440 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1441 MaxStoresPerMemmoveOptSize = 2; 1442 1443 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1444 // are at least 4 bytes aligned. 1445 setMinStackArgumentAlignment(Align(4)); 1446 1447 // Prefer likely predicted branches to selects on out-of-order cores. 1448 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1449 1450 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1451 1452 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1453 1454 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1455 setTargetDAGCombine(ISD::ABS); 1456 } 1457 1458 bool ARMTargetLowering::useSoftFloat() const { 1459 return Subtarget->useSoftFloat(); 1460 } 1461 1462 // FIXME: It might make sense to define the representative register class as the 1463 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1464 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1465 // SPR's representative would be DPR_VFP2. This should work well if register 1466 // pressure tracking were modified such that a register use would increment the 1467 // pressure of the register class's representative and all of it's super 1468 // classes' representatives transitively. 
We have not implemented this because 1469 // of the difficulty prior to coalescing of modeling operand register classes 1470 // due to the common occurrence of cross class copies and subregister insertions 1471 // and extractions. 1472 std::pair<const TargetRegisterClass *, uint8_t> 1473 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1474 MVT VT) const { 1475 const TargetRegisterClass *RRC = nullptr; 1476 uint8_t Cost = 1; 1477 switch (VT.SimpleTy) { 1478 default: 1479 return TargetLowering::findRepresentativeClass(TRI, VT); 1480 // Use DPR as representative register class for all floating point 1481 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1482 // the cost is 1 for both f32 and f64. 1483 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1484 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1485 RRC = &ARM::DPRRegClass; 1486 // When NEON is used for SP, only half of the register file is available 1487 // because operations that define both SP and DP results will be constrained 1488 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1489 // coalescing by double-counting the SP regs. See the FIXME above. 1490 if (Subtarget->useNEONForSinglePrecisionFP()) 1491 Cost = 2; 1492 break; 1493 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1494 case MVT::v4f32: case MVT::v2f64: 1495 RRC = &ARM::DPRRegClass; 1496 Cost = 2; 1497 break; 1498 case MVT::v4i64: 1499 RRC = &ARM::DPRRegClass; 1500 Cost = 4; 1501 break; 1502 case MVT::v8i64: 1503 RRC = &ARM::DPRRegClass; 1504 Cost = 8; 1505 break; 1506 } 1507 return std::make_pair(RRC, Cost); 1508 } 1509 1510 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1511 switch ((ARMISD::NodeType)Opcode) { 1512 case ARMISD::FIRST_NUMBER: break; 1513 case ARMISD::Wrapper: return "ARMISD::Wrapper"; 1514 case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; 1515 case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; 1516 case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; 1517 case ARMISD::CALL: return "ARMISD::CALL"; 1518 case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; 1519 case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; 1520 case ARMISD::BRCOND: return "ARMISD::BRCOND"; 1521 case ARMISD::BR_JT: return "ARMISD::BR_JT"; 1522 case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; 1523 case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; 1524 case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; 1525 case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; 1526 case ARMISD::CMP: return "ARMISD::CMP"; 1527 case ARMISD::CMN: return "ARMISD::CMN"; 1528 case ARMISD::CMPZ: return "ARMISD::CMPZ"; 1529 case ARMISD::CMPFP: return "ARMISD::CMPFP"; 1530 case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; 1531 case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; 1532 case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; 1533 1534 case ARMISD::CMOV: return "ARMISD::CMOV"; 1535 case ARMISD::SUBS: return "ARMISD::SUBS"; 1536 1537 case ARMISD::SSAT: return "ARMISD::SSAT"; 1538 case ARMISD::USAT: return "ARMISD::USAT"; 1539 1540 case ARMISD::ASRL: return "ARMISD::ASRL"; 1541 case ARMISD::LSRL: return "ARMISD::LSRL"; 1542 case ARMISD::LSLL: return "ARMISD::LSLL"; 1543 1544 case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; 1545 case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; 1546 case ARMISD::RRX: return "ARMISD::RRX"; 1547 1548 case ARMISD::ADDC: return "ARMISD::ADDC"; 1549 case ARMISD::ADDE: return "ARMISD::ADDE"; 1550 case ARMISD::SUBC: return 
"ARMISD::SUBC"; 1551 case ARMISD::SUBE: return "ARMISD::SUBE"; 1552 case ARMISD::LSLS: return "ARMISD::LSLS"; 1553 1554 case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; 1555 case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; 1556 case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; 1557 case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; 1558 case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; 1559 1560 case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; 1561 case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; 1562 case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; 1563 1564 case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; 1565 1566 case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; 1567 1568 case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; 1569 1570 case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; 1571 1572 case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; 1573 1574 case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; 1575 case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; 1576 1577 case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; 1578 case ARMISD::VCMP: return "ARMISD::VCMP"; 1579 case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; 1580 case ARMISD::VTST: return "ARMISD::VTST"; 1581 1582 case ARMISD::VSHLs: return "ARMISD::VSHLs"; 1583 case ARMISD::VSHLu: return "ARMISD::VSHLu"; 1584 case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; 1585 case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; 1586 case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; 1587 case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; 1588 case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; 1589 case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; 1590 case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; 1591 case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; 1592 case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; 1593 case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; 1594 case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; 1595 case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; 1596 case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; 1597 case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; 1598 case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; 1599 case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; 1600 case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; 1601 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1602 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1603 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1604 case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; 1605 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1606 case ARMISD::VDUP: return "ARMISD::VDUP"; 1607 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1608 case ARMISD::VEXT: return "ARMISD::VEXT"; 1609 case ARMISD::VREV64: return "ARMISD::VREV64"; 1610 case ARMISD::VREV32: return "ARMISD::VREV32"; 1611 case ARMISD::VREV16: return "ARMISD::VREV16"; 1612 case ARMISD::VZIP: return "ARMISD::VZIP"; 1613 case ARMISD::VUZP: return "ARMISD::VUZP"; 1614 case ARMISD::VTRN: return "ARMISD::VTRN"; 1615 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1616 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1617 case ARMISD::VMOVN: return "ARMISD::VMOVN"; 1618 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1619 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1620 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1621 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1622 case ARMISD::SMLAL: return "ARMISD::SMLAL"; 1623 case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; 1624 case ARMISD::SMLALBT: return 
"ARMISD::SMLALBT"; 1625 case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; 1626 case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; 1627 case ARMISD::SMULWB: return "ARMISD::SMULWB"; 1628 case ARMISD::SMULWT: return "ARMISD::SMULWT"; 1629 case ARMISD::SMLALD: return "ARMISD::SMLALD"; 1630 case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; 1631 case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; 1632 case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; 1633 case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; 1634 case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; 1635 case ARMISD::QADD16b: return "ARMISD::QADD16b"; 1636 case ARMISD::QSUB16b: return "ARMISD::QSUB16b"; 1637 case ARMISD::QADD8b: return "ARMISD::QADD8b"; 1638 case ARMISD::QSUB8b: return "ARMISD::QSUB8b"; 1639 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1640 case ARMISD::BFI: return "ARMISD::BFI"; 1641 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1642 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1643 case ARMISD::VBSL: return "ARMISD::VBSL"; 1644 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1645 case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; 1646 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1647 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1648 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1649 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1650 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1651 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1652 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1653 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1654 case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 1655 case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; 1656 case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; 1657 case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; 1658 case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; 1659 case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; 1660 case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; 1661 case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; 1662 case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; 1663 case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; 1664 case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; 1665 case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; 1666 case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; 1667 case ARMISD::WLS: return "ARMISD::WLS"; 1668 case ARMISD::LE: return "ARMISD::LE"; 1669 case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC"; 1670 case ARMISD::CSINV: return "ARMISD::CSINV"; 1671 case ARMISD::CSNEG: return "ARMISD::CSNEG"; 1672 case ARMISD::CSINC: return "ARMISD::CSINC"; 1673 } 1674 return nullptr; 1675 } 1676 1677 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1678 EVT VT) const { 1679 if (!VT.isVector()) 1680 return getPointerTy(DL); 1681 1682 // MVE has a predicate register. 1683 if (Subtarget->hasMVEIntegerOps() && 1684 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) 1685 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1686 return VT.changeVectorElementTypeToInteger(); 1687 } 1688 1689 /// getRegClassFor - Return the register class that should be used for the 1690 /// specified value type. 1691 const TargetRegisterClass * 1692 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1693 (void)isDivergent; 1694 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1695 // v8i64 to QQQQ registers. 
v4i64 and v8i64 are only used for REG_SEQUENCE to
1696 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1697 // MVE Q registers.
1698 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1699 if (VT == MVT::v4i64)
1700 return &ARM::QQPRRegClass;
1701 if (VT == MVT::v8i64)
1702 return &ARM::QQQQPRRegClass;
1703 }
1704 return TargetLowering::getRegClassFor(VT);
1705 }
1706
1707 // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1708 // source/dest is aligned and the copy size is large enough. We therefore want
1709 // to align such objects passed to memory intrinsics.
1710 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1711 unsigned &PrefAlign) const {
1712 if (!isa<MemIntrinsic>(CI))
1713 return false;
1714 MinSize = 8;
1715 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1716 // cycle faster than 4-byte aligned LDM.
1717 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1718 return true;
1719 }
1720
1721 // Create a fast isel object.
1722 FastISel *
1723 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1724 const TargetLibraryInfo *libInfo) const {
1725 return ARM::createFastISel(funcInfo, libInfo);
1726 }
1727
1728 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1729 unsigned NumVals = N->getNumValues();
1730 if (!NumVals)
1731 return Sched::RegPressure;
1732
1733 for (unsigned i = 0; i != NumVals; ++i) {
1734 EVT VT = N->getValueType(i);
1735 if (VT == MVT::Glue || VT == MVT::Other)
1736 continue;
1737 if (VT.isFloatingPoint() || VT.isVector())
1738 return Sched::ILP;
1739 }
1740
1741 if (!N->isMachineOpcode())
1742 return Sched::RegPressure;
1743
1744 // Loads are scheduled for latency even if the instruction itinerary
1745 // is not available.
1746 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1747 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1748
1749 if (MCID.getNumDefs() == 0)
1750 return Sched::RegPressure;
1751 if (!Itins->isEmpty() &&
1752 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1753 return Sched::ILP;
1754
1755 return Sched::RegPressure;
1756 }
1757
1758 //===----------------------------------------------------------------------===//
1759 // Lowering Code
1760 //===----------------------------------------------------------------------===//
1761
1762 static bool isSRL16(const SDValue &Op) {
1763 if (Op.getOpcode() != ISD::SRL)
1764 return false;
1765 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1766 return Const->getZExtValue() == 16;
1767 return false;
1768 }
1769
1770 static bool isSRA16(const SDValue &Op) {
1771 if (Op.getOpcode() != ISD::SRA)
1772 return false;
1773 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1774 return Const->getZExtValue() == 16;
1775 return false;
1776 }
1777
1778 static bool isSHL16(const SDValue &Op) {
1779 if (Op.getOpcode() != ISD::SHL)
1780 return false;
1781 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1782 return Const->getZExtValue() == 16;
1783 return false;
1784 }
1785
1786 // Check for a signed 16-bit value. We special-case SRA because it makes
1787 // things simpler when also looking for SRAs that aren't sign extending a
1788 // smaller value. Without the check, we'd need to take extra care with
1789 // checking order for some operations.
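// For example, (sra (shl X, 16), 16) is accepted directly; otherwise a value
// qualifies when ComputeNumSignBits reports 17 sign bits, i.e. the i32 value
// is known to fit in its low 16 bits.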
1790 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1791 if (isSRA16(Op)) 1792 return isSHL16(Op.getOperand(0)); 1793 return DAG.ComputeNumSignBits(Op) == 17; 1794 } 1795 1796 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1797 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1798 switch (CC) { 1799 default: llvm_unreachable("Unknown condition code!"); 1800 case ISD::SETNE: return ARMCC::NE; 1801 case ISD::SETEQ: return ARMCC::EQ; 1802 case ISD::SETGT: return ARMCC::GT; 1803 case ISD::SETGE: return ARMCC::GE; 1804 case ISD::SETLT: return ARMCC::LT; 1805 case ISD::SETLE: return ARMCC::LE; 1806 case ISD::SETUGT: return ARMCC::HI; 1807 case ISD::SETUGE: return ARMCC::HS; 1808 case ISD::SETULT: return ARMCC::LO; 1809 case ISD::SETULE: return ARMCC::LS; 1810 } 1811 } 1812 1813 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 1814 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1815 ARMCC::CondCodes &CondCode2) { 1816 CondCode2 = ARMCC::AL; 1817 switch (CC) { 1818 default: llvm_unreachable("Unknown FP condition!"); 1819 case ISD::SETEQ: 1820 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1821 case ISD::SETGT: 1822 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1823 case ISD::SETGE: 1824 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1825 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1826 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1827 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1828 case ISD::SETO: CondCode = ARMCC::VC; break; 1829 case ISD::SETUO: CondCode = ARMCC::VS; break; 1830 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1831 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1832 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1833 case ISD::SETLT: 1834 case ISD::SETULT: CondCode = ARMCC::LT; break; 1835 case ISD::SETLE: 1836 case ISD::SETULE: CondCode = ARMCC::LE; break; 1837 case ISD::SETNE: 1838 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1839 } 1840 } 1841 1842 //===----------------------------------------------------------------------===// 1843 // Calling Convention Implementation 1844 //===----------------------------------------------------------------------===// 1845 1846 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1847 /// account presence of floating point hardware and calling convention 1848 /// limitations, such as support for variadic functions. 1849 CallingConv::ID 1850 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1851 bool isVarArg) const { 1852 switch (CC) { 1853 default: 1854 report_fatal_error("Unsupported calling convention"); 1855 case CallingConv::ARM_AAPCS: 1856 case CallingConv::ARM_APCS: 1857 case CallingConv::GHC: 1858 case CallingConv::CFGuard_Check: 1859 return CC; 1860 case CallingConv::PreserveMost: 1861 return CallingConv::PreserveMost; 1862 case CallingConv::ARM_AAPCS_VFP: 1863 case CallingConv::Swift: 1864 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1865 case CallingConv::C: 1866 if (!Subtarget->isAAPCS_ABI()) 1867 return CallingConv::ARM_APCS; 1868 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 1869 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1870 !isVarArg) 1871 return CallingConv::ARM_AAPCS_VFP; 1872 else 1873 return CallingConv::ARM_AAPCS; 1874 case CallingConv::Fast: 1875 case CallingConv::CXX_FAST_TLS: 1876 if (!Subtarget->isAAPCS_ABI()) { 1877 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 1878 return CallingConv::Fast; 1879 return CallingConv::ARM_APCS; 1880 } else if (Subtarget->hasVFP2Base() && 1881 !Subtarget->isThumb1Only() && !isVarArg) 1882 return CallingConv::ARM_AAPCS_VFP; 1883 else 1884 return CallingConv::ARM_AAPCS; 1885 } 1886 } 1887 1888 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1889 bool isVarArg) const { 1890 return CCAssignFnForNode(CC, false, isVarArg); 1891 } 1892 1893 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1894 bool isVarArg) const { 1895 return CCAssignFnForNode(CC, true, isVarArg); 1896 } 1897 1898 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1899 /// CallingConvention. 1900 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1901 bool Return, 1902 bool isVarArg) const { 1903 switch (getEffectiveCallingConv(CC, isVarArg)) { 1904 default: 1905 report_fatal_error("Unsupported calling convention"); 1906 case CallingConv::ARM_APCS: 1907 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1908 case CallingConv::ARM_AAPCS: 1909 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1910 case CallingConv::ARM_AAPCS_VFP: 1911 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1912 case CallingConv::Fast: 1913 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1914 case CallingConv::GHC: 1915 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1916 case CallingConv::PreserveMost: 1917 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1918 case CallingConv::CFGuard_Check: 1919 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 1920 } 1921 } 1922 1923 /// LowerCallResult - Lower the result values of a call into the 1924 /// appropriate copies out of appropriate physical registers. 1925 SDValue ARMTargetLowering::LowerCallResult( 1926 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1927 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1928 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1929 SDValue ThisVal) const { 1930 // Assign locations to each value returned by this call. 1931 SmallVector<CCValAssign, 16> RVLocs; 1932 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1933 *DAG.getContext()); 1934 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1935 1936 // Copy all of the result registers out of their specified physreg. 1937 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1938 CCValAssign VA = RVLocs[i]; 1939 1940 // Pass 'this' value directly from the argument to return value, to avoid 1941 // reg unit interference 1942 if (i == 0 && isThisReturn) { 1943 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1944 "unexpected return calling convention register assignment"); 1945 InVals.push_back(ThisVal); 1946 continue; 1947 } 1948 1949 SDValue Val; 1950 if (VA.needsCustom()) { 1951 // Handle f64 or half of a v2f64. 
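// When an f64 is returned in a pair of i32 GPRs it is reassembled below with
// two CopyFromReg nodes and an ARMISD::VMOVDRR; a v2f64 simply repeats this
// for each of its two halves.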
1952 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1953 InFlag); 1954 Chain = Lo.getValue(1); 1955 InFlag = Lo.getValue(2); 1956 VA = RVLocs[++i]; // skip ahead to next loc 1957 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1958 InFlag); 1959 Chain = Hi.getValue(1); 1960 InFlag = Hi.getValue(2); 1961 if (!Subtarget->isLittle()) 1962 std::swap (Lo, Hi); 1963 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1964 1965 if (VA.getLocVT() == MVT::v2f64) { 1966 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1967 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1968 DAG.getConstant(0, dl, MVT::i32)); 1969 1970 VA = RVLocs[++i]; // skip ahead to next loc 1971 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1972 Chain = Lo.getValue(1); 1973 InFlag = Lo.getValue(2); 1974 VA = RVLocs[++i]; // skip ahead to next loc 1975 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1976 Chain = Hi.getValue(1); 1977 InFlag = Hi.getValue(2); 1978 if (!Subtarget->isLittle()) 1979 std::swap (Lo, Hi); 1980 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1981 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1982 DAG.getConstant(1, dl, MVT::i32)); 1983 } 1984 } else { 1985 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1986 InFlag); 1987 Chain = Val.getValue(1); 1988 InFlag = Val.getValue(2); 1989 } 1990 1991 switch (VA.getLocInfo()) { 1992 default: llvm_unreachable("Unknown loc info!"); 1993 case CCValAssign::Full: break; 1994 case CCValAssign::BCvt: 1995 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1996 break; 1997 } 1998 1999 InVals.push_back(Val); 2000 } 2001 2002 return Chain; 2003 } 2004 2005 /// LowerMemOpCallTo - Store the argument to the stack. 2006 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 2007 SDValue Arg, const SDLoc &dl, 2008 SelectionDAG &DAG, 2009 const CCValAssign &VA, 2010 ISD::ArgFlagsTy Flags) const { 2011 unsigned LocMemOffset = VA.getLocMemOffset(); 2012 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2013 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2014 StackPtr, PtrOff); 2015 return DAG.getStore( 2016 Chain, dl, Arg, PtrOff, 2017 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 2018 } 2019 2020 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2021 SDValue Chain, SDValue &Arg, 2022 RegsToPassVector &RegsToPass, 2023 CCValAssign &VA, CCValAssign &NextVA, 2024 SDValue &StackPtr, 2025 SmallVectorImpl<SDValue> &MemOpChains, 2026 ISD::ArgFlagsTy Flags) const { 2027 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2028 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2029 unsigned id = Subtarget->isLittle() ? 0 : 1; 2030 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2031 2032 if (NextVA.isRegLoc()) 2033 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2034 else { 2035 assert(NextVA.isMemLoc()); 2036 if (!StackPtr.getNode()) 2037 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2038 getPointerTy(DAG.getDataLayout())); 2039 2040 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 2041 dl, DAG, NextVA, 2042 Flags)); 2043 } 2044 } 2045 2046 /// LowerCall - Lowering a call into a callseq_start <- 2047 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2048 /// nodes. 
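/// Roughly: outgoing arguments are either copied into registers or stored
/// relative to SP, the call node is glued to those copies, and the results
/// are recovered in LowerCallResult after the callseq_end.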
2049 SDValue 2050 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2051 SmallVectorImpl<SDValue> &InVals) const { 2052 SelectionDAG &DAG = CLI.DAG; 2053 SDLoc &dl = CLI.DL; 2054 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2055 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2056 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2057 SDValue Chain = CLI.Chain; 2058 SDValue Callee = CLI.Callee; 2059 bool &isTailCall = CLI.IsTailCall; 2060 CallingConv::ID CallConv = CLI.CallConv; 2061 bool doesNotRet = CLI.DoesNotReturn; 2062 bool isVarArg = CLI.IsVarArg; 2063 2064 MachineFunction &MF = DAG.getMachineFunction(); 2065 MachineFunction::CallSiteInfo CSInfo; 2066 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2067 bool isThisReturn = false; 2068 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); 2069 bool PreferIndirect = false; 2070 2071 // Disable tail calls if they're not supported. 2072 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 2073 isTailCall = false; 2074 2075 if (isa<GlobalAddressSDNode>(Callee)) { 2076 // If we're optimizing for minimum size and the function is called three or 2077 // more times in this block, we can improve codesize by calling indirectly 2078 // as BLXr has a 16-bit encoding. 2079 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2080 if (CLI.CS) { 2081 auto *BB = CLI.CS.getParent(); 2082 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2083 count_if(GV->users(), [&BB](const User *U) { 2084 return isa<Instruction>(U) && 2085 cast<Instruction>(U)->getParent() == BB; 2086 }) > 2; 2087 } 2088 } 2089 if (isTailCall) { 2090 // Check if it's really possible to do a tail call. 2091 isTailCall = IsEligibleForTailCallOptimization( 2092 Callee, CallConv, isVarArg, isStructRet, 2093 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2094 PreferIndirect); 2095 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 2096 report_fatal_error("failed to perform tail call elimination on a call " 2097 "site marked musttail"); 2098 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2099 // detected sibcalls. 2100 if (isTailCall) 2101 ++NumTailCalls; 2102 } 2103 2104 // Analyze operands of the call, assigning locations to each operand. 2105 SmallVector<CCValAssign, 16> ArgLocs; 2106 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2107 *DAG.getContext()); 2108 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2109 2110 // Get a count of how many bytes are to be pushed on the stack. 2111 unsigned NumBytes = CCInfo.getNextStackOffset(); 2112 2113 if (isTailCall) { 2114 // For tail calls, memory operands are available in our caller's stack. 2115 NumBytes = 0; 2116 } else { 2117 // Adjust the stack pointer for the new arguments... 2118 // These operations are automatically eliminated by the prolog/epilog pass 2119 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2120 } 2121 2122 SDValue StackPtr = 2123 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2124 2125 RegsToPassVector RegsToPass; 2126 SmallVector<SDValue, 8> MemOpChains; 2127 2128 // Walk the register/memloc assignments, inserting copies/loads. In the case 2129 // of tail call optimization, arguments are handled later. 
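// Each CCValAssign is either a register location (queued in RegsToPass), a
// plain or byval memory location (stored via LowerMemOpCallTo or the byval
// copy below), or a custom f64/v2f64 assignment that is first split into i32
// pieces.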
2130 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2131 i != e;
2132 ++i, ++realArgIdx) {
2133 CCValAssign &VA = ArgLocs[i];
2134 SDValue Arg = OutVals[realArgIdx];
2135 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2136 bool isByVal = Flags.isByVal();
2137
2138 // Promote the value if needed.
2139 switch (VA.getLocInfo()) {
2140 default: llvm_unreachable("Unknown loc info!");
2141 case CCValAssign::Full: break;
2142 case CCValAssign::SExt:
2143 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2144 break;
2145 case CCValAssign::ZExt:
2146 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2147 break;
2148 case CCValAssign::AExt:
2149 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2150 break;
2151 case CCValAssign::BCvt:
2152 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2153 break;
2154 }
2155
2156 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2157 if (VA.needsCustom()) {
2158 if (VA.getLocVT() == MVT::v2f64) {
2159 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2160 DAG.getConstant(0, dl, MVT::i32));
2161 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2162 DAG.getConstant(1, dl, MVT::i32));
2163
2164 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
2165 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2166
2167 VA = ArgLocs[++i]; // skip ahead to next loc
2168 if (VA.isRegLoc()) {
2169 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
2170 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2171 } else {
2172 assert(VA.isMemLoc());
2173
2174 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
2175 dl, DAG, VA, Flags));
2176 }
2177 } else {
2178 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2179 StackPtr, MemOpChains, Flags);
2180 }
2181 } else if (VA.isRegLoc()) {
2182 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2183 Outs[0].VT == MVT::i32) {
2184 assert(VA.getLocVT() == MVT::i32 &&
2185 "unexpected calling convention register assignment");
2186 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2187 "unexpected use of 'returned'");
2188 isThisReturn = true;
2189 }
2190 const TargetOptions &Options = DAG.getTarget().Options;
2191 if (Options.EnableDebugEntryValues)
2192 CSInfo.emplace_back(VA.getLocReg(), i);
2193 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2194 } else if (isByVal) {
2195 assert(VA.isMemLoc());
2196 unsigned offset = 0;
2197
2198 // True if this byval aggregate will be split between registers
2199 // and memory.
2200 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2201 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2202
2203 if (CurByValIdx < ByValArgsCount) {
2204
2205 unsigned RegBegin, RegEnd;
2206 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2207
2208 EVT PtrVT =
2209 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2210 unsigned int i, j;
2211 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2212 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2213 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2214 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
2215 MachinePointerInfo(),
2216 DAG.InferPtrAlignment(AddArg));
2217 MemOpChains.push_back(Load.getValue(1));
2218 RegsToPass.push_back(std::make_pair(j, Load));
2219 }
2220
2221 // If the parameter size exceeds the register area, the "offset" value
2222 // helps us compute the stack slot for the remaining part correctly.
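// For example, a 16-byte byval starting at r2 occupies r2-r3 (offset == 2),
// and the remaining 8 bytes are copied to the stack by the
// ARMISD::COPY_STRUCT_BYVAL node emitted below.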
2223 offset = RegEnd - RegBegin; 2224 2225 CCInfo.nextInRegsParam(); 2226 } 2227 2228 if (Flags.getByValSize() > 4*offset) { 2229 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2230 unsigned LocMemOffset = VA.getLocMemOffset(); 2231 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2232 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2233 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2234 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2235 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2236 MVT::i32); 2237 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 2238 MVT::i32); 2239 2240 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2241 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2242 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2243 Ops)); 2244 } 2245 } else if (!isTailCall) { 2246 assert(VA.isMemLoc()); 2247 2248 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2249 dl, DAG, VA, Flags)); 2250 } 2251 } 2252 2253 if (!MemOpChains.empty()) 2254 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2255 2256 // Build a sequence of copy-to-reg nodes chained together with token chain 2257 // and flag operands which copy the outgoing args into the appropriate regs. 2258 SDValue InFlag; 2259 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2260 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2261 RegsToPass[i].second, InFlag); 2262 InFlag = Chain.getValue(1); 2263 } 2264 2265 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2266 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2267 // node so that legalize doesn't hack it. 2268 bool isDirect = false; 2269 2270 const TargetMachine &TM = getTargetMachine(); 2271 const Module *Mod = MF.getFunction().getParent(); 2272 const GlobalValue *GV = nullptr; 2273 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2274 GV = G->getGlobal(); 2275 bool isStub = 2276 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2277 2278 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2279 bool isLocalARMFunc = false; 2280 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2281 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2282 2283 if (Subtarget->genLongCalls()) { 2284 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2285 "long-calls codegen is not position independent!"); 2286 // Handle a global address or an external symbol. If it's not one of 2287 // those, the target's already in a register, so we don't need to do 2288 // anything extra. 
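// In both cases the full 32-bit callee address is materialized with a
// constant-pool load (an ARMISD::Wrapper around a TargetConstantPool entry),
// so the call is not constrained by the range of a direct branch.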
2289 if (isa<GlobalAddressSDNode>(Callee)) { 2290 // Create a constant pool entry for the callee address 2291 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2292 ARMConstantPoolValue *CPV = 2293 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2294 2295 // Get the address of the callee into a register 2296 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2297 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2298 Callee = DAG.getLoad( 2299 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2300 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2301 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2302 const char *Sym = S->getSymbol(); 2303 2304 // Create a constant pool entry for the callee address 2305 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2306 ARMConstantPoolValue *CPV = 2307 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2308 ARMPCLabelIndex, 0); 2309 // Get the address of the callee into a register 2310 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2311 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2312 Callee = DAG.getLoad( 2313 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2314 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2315 } 2316 } else if (isa<GlobalAddressSDNode>(Callee)) { 2317 if (!PreferIndirect) { 2318 isDirect = true; 2319 bool isDef = GV->isStrongDefinitionForLinker(); 2320 2321 // ARM call to a local ARM function is predicable. 2322 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2323 // tBX takes a register source operand. 2324 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2325 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2326 Callee = DAG.getNode( 2327 ARMISD::WrapperPIC, dl, PtrVt, 2328 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2329 Callee = DAG.getLoad( 2330 PtrVt, dl, DAG.getEntryNode(), Callee, 2331 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2332 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2333 MachineMemOperand::MOInvariant); 2334 } else if (Subtarget->isTargetCOFF()) { 2335 assert(Subtarget->isTargetWindows() && 2336 "Windows is the only supported COFF target"); 2337 unsigned TargetFlags = GV->hasDLLImportStorageClass() 2338 ? ARMII::MO_DLLIMPORT 2339 : ARMII::MO_NO_FLAG; 2340 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2341 TargetFlags); 2342 if (GV->hasDLLImportStorageClass()) 2343 Callee = 2344 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2345 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2346 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2347 } else { 2348 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2349 } 2350 } 2351 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2352 isDirect = true; 2353 // tBX takes a register source operand. 
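// Pre-v5T Thumb has no BLX, so an interworking call must go through a
// register; the constant-pool load and ARMISD::PIC_ADD below materialize the
// callee address for that purpose.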
2354 const char *Sym = S->getSymbol(); 2355 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2356 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2357 ARMConstantPoolValue *CPV = 2358 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2359 ARMPCLabelIndex, 4); 2360 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2361 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2362 Callee = DAG.getLoad( 2363 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2364 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2365 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2366 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2367 } else { 2368 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2369 } 2370 } 2371 2372 // FIXME: handle tail calls differently. 2373 unsigned CallOpc; 2374 if (Subtarget->isThumb()) { 2375 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2376 CallOpc = ARMISD::CALL_NOLINK; 2377 else 2378 CallOpc = ARMISD::CALL; 2379 } else { 2380 if (!isDirect && !Subtarget->hasV5TOps()) 2381 CallOpc = ARMISD::CALL_NOLINK; 2382 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2383 // Emit regular call when code size is the priority 2384 !Subtarget->hasMinSize()) 2385 // "mov lr, pc; b _foo" to avoid confusing the RSP 2386 CallOpc = ARMISD::CALL_NOLINK; 2387 else 2388 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2389 } 2390 2391 std::vector<SDValue> Ops; 2392 Ops.push_back(Chain); 2393 Ops.push_back(Callee); 2394 2395 // Add argument registers to the end of the list so that they are known live 2396 // into the call. 2397 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2398 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2399 RegsToPass[i].second.getValueType())); 2400 2401 // Add a register mask operand representing the call-preserved registers. 2402 if (!isTailCall) { 2403 const uint32_t *Mask; 2404 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2405 if (isThisReturn) { 2406 // For 'this' returns, use the R0-preserving mask if applicable 2407 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2408 if (!Mask) { 2409 // Set isThisReturn to false if the calling convention is not one that 2410 // allows 'returned' to be modeled in this way, so LowerCallResult does 2411 // not try to pass 'this' straight through 2412 isThisReturn = false; 2413 Mask = ARI->getCallPreservedMask(MF, CallConv); 2414 } 2415 } else 2416 Mask = ARI->getCallPreservedMask(MF, CallConv); 2417 2418 assert(Mask && "Missing call preserved mask for calling convention"); 2419 Ops.push_back(DAG.getRegisterMask(Mask)); 2420 } 2421 2422 if (InFlag.getNode()) 2423 Ops.push_back(InFlag); 2424 2425 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2426 if (isTailCall) { 2427 MF.getFrameInfo().setHasTailCall(); 2428 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2429 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2430 return Ret; 2431 } 2432 2433 // Returns a chain and a flag for retval copy to use. 2434 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2435 InFlag = Chain.getValue(1); 2436 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2437 2438 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2439 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2440 if (!Ins.empty()) 2441 InFlag = Chain.getValue(1); 2442 2443 // Handle result values, copying them out of physregs into vregs that we 2444 // return. 
2445 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2446 InVals, isThisReturn,
2447 isThisReturn ? OutVals[0] : SDValue());
2448 }
2449
2450 /// HandleByVal - Every parameter *after* a byval parameter is passed
2451 /// on the stack. Remember the next parameter register to allocate,
2452 /// and then confiscate the rest of the parameter registers to ensure
2453 /// this.
2454 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2455 unsigned Align) const {
2456 // Byval (as with any stack) slots are always at least 4-byte aligned.
2457 Align = std::max(Align, 4U);
2458
2459 unsigned Reg = State->AllocateReg(GPRArgRegs);
2460 if (!Reg)
2461 return;
2462
2463 unsigned AlignInRegs = Align / 4;
2464 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2465 for (unsigned i = 0; i < Waste; ++i)
2466 Reg = State->AllocateReg(GPRArgRegs);
2467
2468 if (!Reg)
2469 return;
2470
2471 unsigned Excess = 4 * (ARM::R4 - Reg);
2472
2473 // Special case when NSAA != SP and the parameter size is greater than the
2474 // size of all remaining GPR regs. In that case we can't split the parameter;
2475 // we must send it to the stack. We also must set NCRN to R4, so we waste all
2476 // remaining registers.
2477 const unsigned NSAAOffset = State->getNextStackOffset();
2478 if (NSAAOffset != 0 && Size > Excess) {
2479 while (State->AllocateReg(GPRArgRegs))
2480 ;
2481 return;
2482 }
2483
2484 // The first register for the byval parameter is the first register that
2485 // wasn't allocated before this method call, i.e. "Reg".
2486 // If the parameter is small enough to be saved in the range [Reg, r4), then
2487 // the end (one past the last) register would be Reg + param-size-in-regs;
2488 // otherwise the parameter is split between registers and stack, and
2489 // the end register would be r4 in this case.
2490 unsigned ByValRegBegin = Reg;
2491 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2492 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2493 // Note, the first register was already allocated at the beginning of this
2494 // method; allocate the remaining registers we need.
2495 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2496 State->AllocateReg(GPRArgRegs);
2497 // A byval parameter that is split between registers and memory needs its
2498 // size truncated here.
2499 // In the case where the entire structure fits in registers, we set the
2500 // size in memory to zero.
2501 Size = std::max<int>(Size - Excess, 0);
2502 }
2503
2504 /// MatchingStackOffset - Return true if the given stack call argument is
2505 /// already available in the same position (relatively) of the caller's
2506 /// incoming argument stack.
2507 static
2508 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2509 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2510 const TargetInstrInfo *TII) {
2511 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2512 int FI = std::numeric_limits<int>::max();
2513 if (Arg.getOpcode() == ISD::CopyFromReg) {
2514 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2515 if (!Register::isVirtualRegister(VR))
2516 return false;
2517 MachineInstr *Def = MRI->getVRegDef(VR);
2518 if (!Def)
2519 return false;
2520 if (!Flags.isByVal()) {
2521 if (!TII->isLoadFromStackSlot(*Def, FI))
2522 return false;
2523 } else {
2524 return false;
2525 }
2526 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2527 if (Flags.isByVal())
2528 // ByVal argument is passed in as a pointer but it's now being
2529 // dereferenced. e.g.
2530 // define @foo(%struct.X* %A) { 2531 // tail call @bar(%struct.X* byval %A) 2532 // } 2533 return false; 2534 SDValue Ptr = Ld->getBasePtr(); 2535 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2536 if (!FINode) 2537 return false; 2538 FI = FINode->getIndex(); 2539 } else 2540 return false; 2541 2542 assert(FI != std::numeric_limits<int>::max()); 2543 if (!MFI.isFixedObjectIndex(FI)) 2544 return false; 2545 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2546 } 2547 2548 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2549 /// for tail call optimization. Targets which want to do tail call 2550 /// optimization should implement this function. 2551 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2552 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2553 bool isCalleeStructRet, bool isCallerStructRet, 2554 const SmallVectorImpl<ISD::OutputArg> &Outs, 2555 const SmallVectorImpl<SDValue> &OutVals, 2556 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2557 const bool isIndirect) const { 2558 MachineFunction &MF = DAG.getMachineFunction(); 2559 const Function &CallerF = MF.getFunction(); 2560 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2561 2562 assert(Subtarget->supportsTailCall()); 2563 2564 // Indirect tail calls cannot be optimized for Thumb1 if the args 2565 // to the call take up r0-r3. The reason is that there are no legal registers 2566 // left to hold the pointer to the function to be called. 2567 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2568 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) 2569 return false; 2570 2571 // Look for obvious safe cases to perform tail call optimization that do not 2572 // require ABI changes. This is what gcc calls sibcall. 2573 2574 // Exception-handling functions need a special set of instructions to indicate 2575 // a return to the hardware. Tail-calling another function would probably 2576 // break this. 2577 if (CallerF.hasFnAttribute("interrupt")) 2578 return false; 2579 2580 // Also avoid sibcall optimization if either caller or callee uses struct 2581 // return semantics. 2582 if (isCalleeStructRet || isCallerStructRet) 2583 return false; 2584 2585 // Externally-defined functions with weak linkage should not be 2586 // tail-called on ARM when the OS does not support dynamic 2587 // pre-emption of symbols, as the AAELF spec requires normal calls 2588 // to undefined weak functions to be replaced with a NOP or jump to the 2589 // next instruction. The behaviour of branch instructions in this 2590 // situation (as used for tail calls) is implementation-defined, so we 2591 // cannot rely on the linker replacing the tail call with a return. 2592 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2593 const GlobalValue *GV = G->getGlobal(); 2594 const Triple &TT = getTargetMachine().getTargetTriple(); 2595 if (GV->hasExternalWeakLinkage() && 2596 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2597 return false; 2598 } 2599 2600 // Check that the call results are passed in the same way. 2601 LLVMContext &C = *DAG.getContext(); 2602 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2603 CCAssignFnForReturn(CalleeCC, isVarArg), 2604 CCAssignFnForReturn(CallerCC, isVarArg))) 2605 return false; 2606 // The callee has to preserve all registers the caller needs to preserve. 
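// Concretely, the set of registers preserved by the caller's convention must
// be a subset of the set preserved by the callee's; otherwise the tail call
// could clobber a register that the caller's own caller expects to survive.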
2607 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2608 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2609 if (CalleeCC != CallerCC) { 2610 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2611 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2612 return false; 2613 } 2614 2615 // If Caller's vararg or byval argument has been split between registers and 2616 // stack, do not perform tail call, since part of the argument is in caller's 2617 // local frame. 2618 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2619 if (AFI_Caller->getArgRegsSaveSize()) 2620 return false; 2621 2622 // If the callee takes no arguments then go on to check the results of the 2623 // call. 2624 if (!Outs.empty()) { 2625 // Check if stack adjustment is needed. For now, do not do this if any 2626 // argument is passed on the stack. 2627 SmallVector<CCValAssign, 16> ArgLocs; 2628 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2629 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2630 if (CCInfo.getNextStackOffset()) { 2631 // Check if the arguments are already laid out in the right way as 2632 // the caller's fixed stack objects. 2633 MachineFrameInfo &MFI = MF.getFrameInfo(); 2634 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2635 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2636 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2637 i != e; 2638 ++i, ++realArgIdx) { 2639 CCValAssign &VA = ArgLocs[i]; 2640 EVT RegVT = VA.getLocVT(); 2641 SDValue Arg = OutVals[realArgIdx]; 2642 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2643 if (VA.getLocInfo() == CCValAssign::Indirect) 2644 return false; 2645 if (VA.needsCustom()) { 2646 // f64 and vector types are split into multiple registers or 2647 // register/stack-slot combinations. The types will not match 2648 // the registers; give up on memory f64 refs until we figure 2649 // out what to do about this. 2650 if (!VA.isRegLoc()) 2651 return false; 2652 if (!ArgLocs[++i].isRegLoc()) 2653 return false; 2654 if (RegVT == MVT::v2f64) { 2655 if (!ArgLocs[++i].isRegLoc()) 2656 return false; 2657 if (!ArgLocs[++i].isRegLoc()) 2658 return false; 2659 } 2660 } else if (!VA.isRegLoc()) { 2661 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2662 MFI, MRI, TII)) 2663 return false; 2664 } 2665 } 2666 } 2667 2668 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2669 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2670 return false; 2671 } 2672 2673 return true; 2674 } 2675 2676 bool 2677 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2678 MachineFunction &MF, bool isVarArg, 2679 const SmallVectorImpl<ISD::OutputArg> &Outs, 2680 LLVMContext &Context) const { 2681 SmallVector<CCValAssign, 16> RVLocs; 2682 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2683 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2684 } 2685 2686 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2687 const SDLoc &DL, SelectionDAG &DAG) { 2688 const MachineFunction &MF = DAG.getMachineFunction(); 2689 const Function &F = MF.getFunction(); 2690 2691 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2692 2693 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2694 // version of the "preferred return address". 
These offsets affect the return 2695 // instruction if this is a return from PL1 without hypervisor extensions. 2696 // IRQ/FIQ: +4 "subs pc, lr, #4" 2697 // SWI: 0 "subs pc, lr, #0" 2698 // ABORT: +4 "subs pc, lr, #4" 2699 // UNDEF: +4/+2 "subs pc, lr, #0" 2700 // UNDEF varies depending on where the exception came from ARM or Thumb 2701 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2702 2703 int64_t LROffset; 2704 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2705 IntKind == "ABORT") 2706 LROffset = 4; 2707 else if (IntKind == "SWI" || IntKind == "UNDEF") 2708 LROffset = 0; 2709 else 2710 report_fatal_error("Unsupported interrupt attribute. If present, value " 2711 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2712 2713 RetOps.insert(RetOps.begin() + 1, 2714 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2715 2716 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2717 } 2718 2719 SDValue 2720 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2721 bool isVarArg, 2722 const SmallVectorImpl<ISD::OutputArg> &Outs, 2723 const SmallVectorImpl<SDValue> &OutVals, 2724 const SDLoc &dl, SelectionDAG &DAG) const { 2725 // CCValAssign - represent the assignment of the return value to a location. 2726 SmallVector<CCValAssign, 16> RVLocs; 2727 2728 // CCState - Info about the registers and stack slots. 2729 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2730 *DAG.getContext()); 2731 2732 // Analyze outgoing return values. 2733 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2734 2735 SDValue Flag; 2736 SmallVector<SDValue, 4> RetOps; 2737 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2738 bool isLittleEndian = Subtarget->isLittle(); 2739 2740 MachineFunction &MF = DAG.getMachineFunction(); 2741 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2742 AFI->setReturnRegsCount(RVLocs.size()); 2743 2744 // Copy the result values into the output registers. 2745 for (unsigned i = 0, realRVLocIdx = 0; 2746 i != RVLocs.size(); 2747 ++i, ++realRVLocIdx) { 2748 CCValAssign &VA = RVLocs[i]; 2749 assert(VA.isRegLoc() && "Can only return in registers!"); 2750 2751 SDValue Arg = OutVals[realRVLocIdx]; 2752 bool ReturnF16 = false; 2753 2754 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2755 // Half-precision return values can be returned like this: 2756 // 2757 // t11 f16 = fadd ... 2758 // t12: i16 = bitcast t11 2759 // t13: i32 = zero_extend t12 2760 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2761 // 2762 // to avoid code generation for bitcasts, we simply set Arg to the node 2763 // that produces the f16 value, t11 in this case. 2764 // 2765 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2766 SDValue ZE = Arg.getOperand(0); 2767 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2768 SDValue BC = ZE.getOperand(0); 2769 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2770 Arg = BC.getOperand(0); 2771 ReturnF16 = true; 2772 } 2773 } 2774 } 2775 } 2776 2777 switch (VA.getLocInfo()) { 2778 default: llvm_unreachable("Unknown loc info!"); 2779 case CCValAssign::Full: break; 2780 case CCValAssign::BCvt: 2781 if (!ReturnF16) 2782 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2783 break; 2784 } 2785 2786 if (VA.needsCustom()) { 2787 if (VA.getLocVT() == MVT::v2f64) { 2788 // Extract the first half and return it in two registers. 
2789 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2790 DAG.getConstant(0, dl, MVT::i32)); 2791 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2792 DAG.getVTList(MVT::i32, MVT::i32), Half); 2793 2794 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2795 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2796 Flag); 2797 Flag = Chain.getValue(1); 2798 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2799 VA = RVLocs[++i]; // skip ahead to next loc 2800 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2801 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2802 Flag); 2803 Flag = Chain.getValue(1); 2804 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2805 VA = RVLocs[++i]; // skip ahead to next loc 2806 2807 // Extract the 2nd half and fall through to handle it as an f64 value. 2808 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2809 DAG.getConstant(1, dl, MVT::i32)); 2810 } 2811 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2812 // available. 2813 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2814 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2815 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2816 fmrrd.getValue(isLittleEndian ? 0 : 1), 2817 Flag); 2818 Flag = Chain.getValue(1); 2819 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2820 VA = RVLocs[++i]; // skip ahead to next loc 2821 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2822 fmrrd.getValue(isLittleEndian ? 1 : 0), 2823 Flag); 2824 } else 2825 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2826 2827 // Guarantee that all emitted copies are 2828 // stuck together, avoiding something bad. 2829 Flag = Chain.getValue(1); 2830 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2831 ReturnF16 ? MVT::f16 : VA.getLocVT())); 2832 } 2833 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2834 const MCPhysReg *I = 2835 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2836 if (I) { 2837 for (; *I; ++I) { 2838 if (ARM::GPRRegClass.contains(*I)) 2839 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2840 else if (ARM::DPRRegClass.contains(*I)) 2841 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2842 else 2843 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2844 } 2845 } 2846 2847 // Update chain and glue. 2848 RetOps[0] = Chain; 2849 if (Flag.getNode()) 2850 RetOps.push_back(Flag); 2851 2852 // CPUs which aren't M-class use a special sequence to return from 2853 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2854 // though we use "subs pc, lr, #N"). 2855 // 2856 // M-class CPUs actually use a normal return sequence with a special 2857 // (hardware-provided) value in LR, so the normal code path works. 
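  // (On M-class the LR value is the EXC_RETURN magic value, e.g. 0xFFFFFFF9,
  // so an ordinary "bx lr" return performs the exception return.)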
2858 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 2859 !Subtarget->isMClass()) { 2860 if (Subtarget->isThumb1Only()) 2861 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2862 return LowerInterruptReturn(RetOps, dl, DAG); 2863 } 2864 2865 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2866 } 2867 2868 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2869 if (N->getNumValues() != 1) 2870 return false; 2871 if (!N->hasNUsesOfValue(1, 0)) 2872 return false; 2873 2874 SDValue TCChain = Chain; 2875 SDNode *Copy = *N->use_begin(); 2876 if (Copy->getOpcode() == ISD::CopyToReg) { 2877 // If the copy has a glue operand, we conservatively assume it isn't safe to 2878 // perform a tail call. 2879 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2880 return false; 2881 TCChain = Copy->getOperand(0); 2882 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2883 SDNode *VMov = Copy; 2884 // f64 returned in a pair of GPRs. 2885 SmallPtrSet<SDNode*, 2> Copies; 2886 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2887 UI != UE; ++UI) { 2888 if (UI->getOpcode() != ISD::CopyToReg) 2889 return false; 2890 Copies.insert(*UI); 2891 } 2892 if (Copies.size() > 2) 2893 return false; 2894 2895 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2896 UI != UE; ++UI) { 2897 SDValue UseChain = UI->getOperand(0); 2898 if (Copies.count(UseChain.getNode())) 2899 // Second CopyToReg 2900 Copy = *UI; 2901 else { 2902 // We are at the top of this chain. 2903 // If the copy has a glue operand, we conservatively assume it 2904 // isn't safe to perform a tail call. 2905 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2906 return false; 2907 // First CopyToReg 2908 TCChain = UseChain; 2909 } 2910 } 2911 } else if (Copy->getOpcode() == ISD::BITCAST) { 2912 // f32 returned in a single GPR. 2913 if (!Copy->hasOneUse()) 2914 return false; 2915 Copy = *Copy->use_begin(); 2916 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2917 return false; 2918 // If the copy has a glue operand, we conservatively assume it isn't safe to 2919 // perform a tail call. 2920 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2921 return false; 2922 TCChain = Copy->getOperand(0); 2923 } else { 2924 return false; 2925 } 2926 2927 bool HasRet = false; 2928 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2929 UI != UE; ++UI) { 2930 if (UI->getOpcode() != ARMISD::RET_FLAG && 2931 UI->getOpcode() != ARMISD::INTRET_FLAG) 2932 return false; 2933 HasRet = true; 2934 } 2935 2936 if (!HasRet) 2937 return false; 2938 2939 Chain = TCChain; 2940 return true; 2941 } 2942 2943 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 2944 if (!Subtarget->supportsTailCall()) 2945 return false; 2946 2947 auto Attr = 2948 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2949 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2950 return false; 2951 2952 return true; 2953 } 2954 2955 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2956 // and pass the lower and high parts through. 2957 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2958 SDLoc DL(Op); 2959 SDValue WriteValue = Op->getOperand(2); 2960 2961 // This function is only supposed to be called for i64 type argument. 
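  // Illustrative example: writing 0x0123456789ABCDEF becomes two i32 halves,
  // Lo = 0x89ABCDEF (element 0) and Hi = 0x01234567 (element 1).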
2962 assert(WriteValue.getValueType() == MVT::i64 2963 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2964 2965 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2966 DAG.getConstant(0, DL, MVT::i32)); 2967 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2968 DAG.getConstant(1, DL, MVT::i32)); 2969 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2970 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 2971 } 2972 2973 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2974 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2975 // one of the above-mentioned nodes. It has to be wrapped because otherwise 2976 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2977 // be used to form an addressing mode. These wrapped nodes will be selected 2978 // into MOVi. 2979 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 2980 SelectionDAG &DAG) const { 2981 EVT PtrVT = Op.getValueType(); 2982 // FIXME there is no actual debug info here 2983 SDLoc dl(Op); 2984 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2985 SDValue Res; 2986 2987 // When generating execute-only code Constant Pools must be promoted to the 2988 // global data section. It's a bit ugly that we can't share them across basic 2989 // blocks, but this way we guarantee that execute-only code behaves correctly with 2990 // position-independent addressing modes. 2991 if (Subtarget->genExecuteOnly()) { 2992 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 2993 auto T = const_cast<Type*>(CP->getType()); 2994 auto C = const_cast<Constant*>(CP->getConstVal()); 2995 auto M = const_cast<Module*>(DAG.getMachineFunction(). 2996 getFunction().getParent()); 2997 auto GV = new GlobalVariable( 2998 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 2999 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3000 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3001 Twine(AFI->createPICLabelUId()) 3002 ); 3003 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3004 dl, PtrVT); 3005 return LowerGlobalAddress(GA, DAG); 3006 } 3007 3008 if (CP->isMachineConstantPoolEntry()) 3009 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 3010 CP->getAlignment()); 3011 else 3012 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 3013 CP->getAlignment()); 3014 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3015 } 3016 3017 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3018 return MachineJumpTableInfo::EK_Inline; 3019 } 3020 3021 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3022 SelectionDAG &DAG) const { 3023 MachineFunction &MF = DAG.getMachineFunction(); 3024 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3025 unsigned ARMPCLabelIndex = 0; 3026 SDLoc DL(Op); 3027 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3028 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3029 SDValue CPAddr; 3030 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3031 if (!IsPositionIndependent) { 3032 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 3033 } else { 3034 unsigned PCAdj = Subtarget->isThumb() ?
4 : 8; 3035 ARMPCLabelIndex = AFI->createPICLabelUId(); 3036 ARMConstantPoolValue *CPV = 3037 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3038 ARMCP::CPBlockAddress, PCAdj); 3039 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3040 } 3041 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3042 SDValue Result = DAG.getLoad( 3043 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3044 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3045 if (!IsPositionIndependent) 3046 return Result; 3047 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3048 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3049 } 3050 3051 /// Convert a TLS address reference into the correct sequence of loads 3052 /// and calls to compute the variable's address for Darwin, and return an 3053 /// SDValue containing the final node. 3054 3055 /// Darwin only has one TLS scheme which must be capable of dealing with the 3056 /// fully general situation, in the worst case. This means: 3057 /// + "extern __thread" declaration. 3058 /// + Defined in a possibly unknown dynamic library. 3059 /// 3060 /// The general system is that each __thread variable has a [3 x i32] descriptor 3061 /// which contains information used by the runtime to calculate the address. The 3062 /// only part of this the compiler needs to know about is the first word, which 3063 /// contains a function pointer that must be called with the address of the 3064 /// entire descriptor in "r0". 3065 /// 3066 /// Since this descriptor may be in a different unit, in general access must 3067 /// proceed along the usual ARM rules. A common sequence to produce is: 3068 /// 3069 /// movw rT1, :lower16:_var$non_lazy_ptr 3070 /// movt rT1, :upper16:_var$non_lazy_ptr 3071 /// ldr r0, [rT1] 3072 /// ldr rT2, [r0] 3073 /// blx rT2 3074 /// [...address now in r0...] 3075 SDValue 3076 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, 3077 SelectionDAG &DAG) const { 3078 assert(Subtarget->isTargetDarwin() && 3079 "This function expects a Darwin target"); 3080 SDLoc DL(Op); 3081 3082 // The first step is to get the address of the actual global symbol. This is where 3083 // the TLS descriptor lives. 3084 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); 3085 3086 // The first entry in the descriptor is a function pointer that we must call 3087 // to obtain the address of the variable. 3088 SDValue Chain = DAG.getEntryNode(); 3089 SDValue FuncTLVGet = DAG.getLoad( 3090 MVT::i32, DL, Chain, DescAddr, 3091 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 3092 /* Alignment = */ 4, 3093 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | 3094 MachineMemOperand::MOInvariant); 3095 Chain = FuncTLVGet.getValue(1); 3096 3097 MachineFunction &F = DAG.getMachineFunction(); 3098 MachineFrameInfo &MFI = F.getFrameInfo(); 3099 MFI.setAdjustsStack(true); 3100 3101 // TLS calls preserve all registers except those that absolutely must be 3102 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be 3103 // silly). 3104 auto TRI = 3105 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); 3106 auto ARI = static_cast<const ARMRegisterInfo *>(TRI); 3107 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); 3108 3109 // Finally, we can make the call. This is just a degenerate version of a 3110 // normal ARM call node: r0 takes the address of the descriptor, and 3111 // the call returns the address of the variable in this thread.
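  // Roughly (illustrative, not the literal emitted code):
  //   r0 <- DescAddr
  //   call [DescAddr]      ; the function pointer in the descriptor's first word
  //   r0 -> address of the variable in this thread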
3112 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3113 Chain = 3114 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3115 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3116 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3117 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3118 } 3119 3120 SDValue 3121 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3122 SelectionDAG &DAG) const { 3123 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3124 3125 SDValue Chain = DAG.getEntryNode(); 3126 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3127 SDLoc DL(Op); 3128 3129 // Load the current TEB (thread environment block) 3130 SDValue Ops[] = {Chain, 3131 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3132 DAG.getTargetConstant(15, DL, MVT::i32), 3133 DAG.getTargetConstant(0, DL, MVT::i32), 3134 DAG.getTargetConstant(13, DL, MVT::i32), 3135 DAG.getTargetConstant(0, DL, MVT::i32), 3136 DAG.getTargetConstant(2, DL, MVT::i32)}; 3137 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3138 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3139 3140 SDValue TEB = CurrentTEB.getValue(0); 3141 Chain = CurrentTEB.getValue(1); 3142 3143 // Load the ThreadLocalStoragePointer from the TEB 3144 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3145 SDValue TLSArray = 3146 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3147 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3148 3149 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3150 // offset into the TLSArray. 3151 3152 // Load the TLS index from the C runtime 3153 SDValue TLSIndex = 3154 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3155 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3156 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3157 3158 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3159 DAG.getConstant(2, DL, MVT::i32)); 3160 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3161 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3162 MachinePointerInfo()); 3163 3164 // Get the offset of the start of the .tls section (section base) 3165 const auto *GA = cast<GlobalAddressSDNode>(Op); 3166 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3167 SDValue Offset = DAG.getLoad( 3168 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3169 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 3170 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3171 3172 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3173 } 3174 3175 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3176 SDValue 3177 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3178 SelectionDAG &DAG) const { 3179 SDLoc dl(GA); 3180 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3181 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3182 MachineFunction &MF = DAG.getMachineFunction(); 3183 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3184 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3185 ARMConstantPoolValue *CPV = 3186 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3187 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3188 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3189 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3190 Argument = DAG.getLoad( 3191 PtrVT, dl, DAG.getEntryNode(), Argument, 3192 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3193 SDValue Chain = Argument.getValue(1); 3194 3195 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3196 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3197 3198 // call __tls_get_addr. 3199 ArgListTy Args; 3200 ArgListEntry Entry; 3201 Entry.Node = Argument; 3202 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3203 Args.push_back(Entry); 3204 3205 // FIXME: is there useful debug info available here? 3206 TargetLowering::CallLoweringInfo CLI(DAG); 3207 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3208 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3209 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3210 3211 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3212 return CallResult.first; 3213 } 3214 3215 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3216 // "local exec" model. 3217 SDValue 3218 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3219 SelectionDAG &DAG, 3220 TLSModel::Model model) const { 3221 const GlobalValue *GV = GA->getGlobal(); 3222 SDLoc dl(GA); 3223 SDValue Offset; 3224 SDValue Chain = DAG.getEntryNode(); 3225 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3226 // Get the Thread Pointer 3227 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3228 3229 if (model == TLSModel::InitialExec) { 3230 MachineFunction &MF = DAG.getMachineFunction(); 3231 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3232 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3233 // Initial exec model. 3234 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3235 ARMConstantPoolValue *CPV = 3236 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3237 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3238 true); 3239 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3240 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3241 Offset = DAG.getLoad( 3242 PtrVT, dl, Chain, Offset, 3243 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3244 Chain = Offset.getValue(1); 3245 3246 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3247 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3248 3249 Offset = DAG.getLoad( 3250 PtrVT, dl, Chain, Offset, 3251 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3252 } else { 3253 // local exec model 3254 assert(model == TLSModel::LocalExec); 3255 ARMConstantPoolValue *CPV = 3256 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3257 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3258 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3259 Offset = DAG.getLoad( 3260 PtrVT, dl, Chain, Offset, 3261 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3262 } 3263 3264 // The address of the thread local variable is the add of the thread 3265 // pointer with the offset of the variable. 
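  // For example, in the local-exec case the final address is simply the
  // thread pointer plus var(TPOFF), with the TPOFF offset loaded from the
  // constant-pool entry created above.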
3266 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3267 } 3268 3269 SDValue 3270 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3271 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3272 if (DAG.getTarget().useEmulatedTLS()) 3273 return LowerToTLSEmulatedModel(GA, DAG); 3274 3275 if (Subtarget->isTargetDarwin()) 3276 return LowerGlobalTLSAddressDarwin(Op, DAG); 3277 3278 if (Subtarget->isTargetWindows()) 3279 return LowerGlobalTLSAddressWindows(Op, DAG); 3280 3281 // TODO: implement the "local dynamic" model 3282 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3283 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3284 3285 switch (model) { 3286 case TLSModel::GeneralDynamic: 3287 case TLSModel::LocalDynamic: 3288 return LowerToTLSGeneralDynamicModel(GA, DAG); 3289 case TLSModel::InitialExec: 3290 case TLSModel::LocalExec: 3291 return LowerToTLSExecModels(GA, DAG, model); 3292 } 3293 llvm_unreachable("bogus TLS model"); 3294 } 3295 3296 /// Return true if all users of V are within function F, looking through 3297 /// ConstantExprs. 3298 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3299 SmallVector<const User*,4> Worklist; 3300 for (auto *U : V->users()) 3301 Worklist.push_back(U); 3302 while (!Worklist.empty()) { 3303 auto *U = Worklist.pop_back_val(); 3304 if (isa<ConstantExpr>(U)) { 3305 for (auto *UU : U->users()) 3306 Worklist.push_back(UU); 3307 continue; 3308 } 3309 3310 auto *I = dyn_cast<Instruction>(U); 3311 if (!I || I->getParent()->getParent() != F) 3312 return false; 3313 } 3314 return true; 3315 } 3316 3317 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3318 const GlobalValue *GV, SelectionDAG &DAG, 3319 EVT PtrVT, const SDLoc &dl) { 3320 // If we're creating a pool entry for a constant global with unnamed address, 3321 // and the global is small enough, we can emit it inline into the constant pool 3322 // to save ourselves an indirection. 3323 // 3324 // This is a win if the constant is only used in one function (so it doesn't 3325 // need to be duplicated) or duplicating the constant wouldn't increase code 3326 // size (implying the constant is no larger than 4 bytes). 3327 const Function &F = DAG.getMachineFunction().getFunction(); 3328 3329 // We rely on this decision to inline being idempotent and unrelated to the 3330 // use-site. We know that if we inline a variable at one use site, we'll 3331 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3332 // doesn't know about this optimization, so bail out if it's enabled, else 3333 // we could decide to inline here (and thus never emit the GV) while 3334 // fast-isel generated code still requires the GV. 3335 if (!EnableConstpoolPromotion || 3336 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3337 return SDValue(); 3338 3339 auto *GVar = dyn_cast<GlobalVariable>(GV); 3340 if (!GVar || !GVar->hasInitializer() || 3341 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3342 !GVar->hasLocalLinkage()) 3343 return SDValue(); 3344 3345 // If we inline a value that contains relocations, we move the relocations 3346 // from .data to .text. This is not allowed in position-independent code.
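  // For example, an initializer such as
  //   @p = private unnamed_addr constant i32* @g
  // needs a relocation against @g, so it is not promoted when compiling
  // position-independent (or ROPI) code.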
3347 auto *Init = GVar->getInitializer(); 3348 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3349 Init->needsRelocation()) 3350 return SDValue(); 3351 3352 // The constant islands pass can only really deal with alignment requests 3353 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3354 // any type wanting greater alignment requirements than 4 bytes. We also 3355 // can only promote constants that are multiples of 4 bytes in size or 3356 // are paddable to a multiple of 4. Currently we only try and pad constants 3357 // that are strings for simplicity. 3358 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3359 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3360 unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); 3361 unsigned RequiredPadding = 4 - (Size % 4); 3362 bool PaddingPossible = 3363 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3364 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3365 Size == 0) 3366 return SDValue(); 3367 3368 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3369 MachineFunction &MF = DAG.getMachineFunction(); 3370 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3371 3372 // We can't bloat the constant pool too much, else the ConstantIslands pass 3373 // may fail to converge. If we haven't promoted this global yet (it may have 3374 // multiple uses), and promoting it would increase the constant pool size (Sz 3375 // > 4), ensure we have space to do so up to MaxTotal. 3376 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3377 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3378 ConstpoolPromotionMaxTotal) 3379 return SDValue(); 3380 3381 // This is only valid if all users are in a single function; we can't clone 3382 // the constant in general. The LLVM IR unnamed_addr allows merging 3383 // constants, but not cloning them. 3384 // 3385 // We could potentially allow cloning if we could prove all uses of the 3386 // constant in the current function don't care about the address, like 3387 // printf format strings. But that isn't implemented for now. 3388 if (!allUsersAreInFunction(GVar, &F)) 3389 return SDValue(); 3390 3391 // We're going to inline this global. Pad it out if needed. 
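  // E.g. a 6-byte string initializer c"hello\00" gets two zero bytes appended
  // so the resulting constant-pool entry is 8 bytes, a multiple of 4.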
3392 if (RequiredPadding != 4) { 3393 StringRef S = CDAInit->getAsString(); 3394 3395 SmallVector<uint8_t,16> V(S.size()); 3396 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3397 while (RequiredPadding--) 3398 V.push_back(0); 3399 Init = ConstantDataArray::get(*DAG.getContext(), V); 3400 } 3401 3402 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3403 SDValue CPAddr = 3404 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3405 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3406 AFI->markGlobalAsPromotedToConstantPool(GVar); 3407 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3408 PaddedSize - 4); 3409 } 3410 ++NumConstpoolPromoted; 3411 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3412 } 3413 3414 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3415 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3416 if (!(GV = GA->getBaseObject())) 3417 return false; 3418 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3419 return V->isConstant(); 3420 return isa<Function>(GV); 3421 } 3422 3423 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3424 SelectionDAG &DAG) const { 3425 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3426 default: llvm_unreachable("unknown object format"); 3427 case Triple::COFF: 3428 return LowerGlobalAddressWindows(Op, DAG); 3429 case Triple::ELF: 3430 return LowerGlobalAddressELF(Op, DAG); 3431 case Triple::MachO: 3432 return LowerGlobalAddressDarwin(Op, DAG); 3433 } 3434 } 3435 3436 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3437 SelectionDAG &DAG) const { 3438 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3439 SDLoc dl(Op); 3440 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3441 const TargetMachine &TM = getTargetMachine(); 3442 bool IsRO = isReadOnly(GV); 3443 3444 // promoteToConstantPool only if not generating XO text section 3445 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3446 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3447 return V; 3448 3449 if (isPositionIndependent()) { 3450 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3451 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3452 UseGOT_PREL ? ARMII::MO_GOT : 0); 3453 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3454 if (UseGOT_PREL) 3455 Result = 3456 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3457 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3458 return Result; 3459 } else if (Subtarget->isROPI() && IsRO) { 3460 // PC-relative. 3461 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3462 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3463 return Result; 3464 } else if (Subtarget->isRWPI() && !IsRO) { 3465 // SB-relative. 
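    // The address is formed as R9 (the static base) plus the SB-relative
    // offset of the global; the offset is materialized below either with a
    // movw/movt pair or with a literal-pool load.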
3466 SDValue RelAddr; 3467 if (Subtarget->useMovt()) { 3468 ++NumMovwMovt; 3469 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3470 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3471 } else { // use literal pool for address constant 3472 ARMConstantPoolValue *CPV = 3473 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3474 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3475 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3476 RelAddr = DAG.getLoad( 3477 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3478 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3479 } 3480 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3481 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3482 return Result; 3483 } 3484 3485 // If we have T2 ops, we can materialize the address directly via movt/movw 3486 // pair. This is always cheaper. 3487 if (Subtarget->useMovt()) { 3488 ++NumMovwMovt; 3489 // FIXME: Once remat is capable of dealing with instructions with register 3490 // operands, expand this into two nodes. 3491 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3492 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3493 } else { 3494 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3495 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3496 return DAG.getLoad( 3497 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3498 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3499 } 3500 } 3501 3502 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3503 SelectionDAG &DAG) const { 3504 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3505 "ROPI/RWPI not currently supported for Darwin"); 3506 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3507 SDLoc dl(Op); 3508 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3509 3510 if (Subtarget->useMovt()) 3511 ++NumMovwMovt; 3512 3513 // FIXME: Once remat is capable of dealing with instructions with register 3514 // operands, expand this into multiple nodes 3515 unsigned Wrapper = 3516 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3517 3518 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3519 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3520 3521 if (Subtarget->isGVIndirectSymbol(GV)) 3522 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3523 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3524 return Result; 3525 } 3526 3527 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3528 SelectionDAG &DAG) const { 3529 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3530 assert(Subtarget->useMovt() && 3531 "Windows on ARM expects to use movw/movt"); 3532 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3533 "ROPI/RWPI not currently supported for Windows"); 3534 3535 const TargetMachine &TM = getTargetMachine(); 3536 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3537 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3538 if (GV->hasDLLImportStorageClass()) 3539 TargetFlags = ARMII::MO_DLLIMPORT; 3540 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3541 TargetFlags = ARMII::MO_COFFSTUB; 3542 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3543 SDValue Result; 3544 SDLoc DL(Op); 3545 3546 ++NumMovwMovt; 3547 3548 // FIXME: Once remat is capable of dealing with instructions with register 3549 // operands, expand this into two nodes. 
3550 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3551 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3552 TargetFlags)); 3553 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3554 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3555 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3556 return Result; 3557 } 3558 3559 SDValue 3560 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3561 SDLoc dl(Op); 3562 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3563 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3564 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3565 Op.getOperand(1), Val); 3566 } 3567 3568 SDValue 3569 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3570 SDLoc dl(Op); 3571 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3572 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3573 } 3574 3575 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3576 SelectionDAG &DAG) const { 3577 SDLoc dl(Op); 3578 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3579 Op.getOperand(0)); 3580 } 3581 3582 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3583 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3584 unsigned IntNo = 3585 cast<ConstantSDNode>( 3586 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3587 ->getZExtValue(); 3588 switch (IntNo) { 3589 default: 3590 return SDValue(); // Don't custom lower most intrinsics. 3591 case Intrinsic::arm_gnu_eabi_mcount: { 3592 MachineFunction &MF = DAG.getMachineFunction(); 3593 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3594 SDLoc dl(Op); 3595 SDValue Chain = Op.getOperand(0); 3596 // call "\01__gnu_mcount_nc" 3597 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3598 const uint32_t *Mask = 3599 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3600 assert(Mask && "Missing call preserved mask for calling convention"); 3601 // Mark LR an implicit live-in. 3602 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3603 SDValue ReturnAddress = 3604 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3605 std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; 3606 SDValue Callee = 3607 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3608 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3609 if (Subtarget->isThumb()) 3610 return SDValue( 3611 DAG.getMachineNode( 3612 ARM::tBL_PUSHLR, dl, ResultTys, 3613 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3614 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3615 0); 3616 return SDValue( 3617 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3618 {ReturnAddress, Callee, RegisterMask, Chain}), 3619 0); 3620 } 3621 } 3622 } 3623 3624 SDValue 3625 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3626 const ARMSubtarget *Subtarget) const { 3627 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3628 SDLoc dl(Op); 3629 switch (IntNo) { 3630 default: return SDValue(); // Don't custom lower most intrinsics. 
3631 case Intrinsic::thread_pointer: { 3632 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3633 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3634 } 3635 case Intrinsic::arm_cls: { 3636 const SDValue &Operand = Op.getOperand(1); 3637 const EVT VTy = Op.getValueType(); 3638 SDValue SRA = 3639 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3640 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3641 SDValue SHL = 3642 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3643 SDValue OR = 3644 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3645 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3646 return Result; 3647 } 3648 case Intrinsic::arm_cls64: { 3649 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3650 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3651 const SDValue &Operand = Op.getOperand(1); 3652 const EVT VTy = Op.getValueType(); 3653 3654 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3655 DAG.getConstant(1, dl, VTy)); 3656 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3657 DAG.getConstant(0, dl, VTy)); 3658 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3659 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3660 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3661 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3662 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3663 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3664 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3665 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3666 SDValue CheckLo = 3667 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3668 SDValue HiIsZero = 3669 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3670 SDValue AdjustedLo = 3671 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3672 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo); 3673 SDValue Result = 3674 DAG.getSelect(dl, VTy, CheckLo, 3675 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3676 return Result; 3677 } 3678 case Intrinsic::eh_sjlj_lsda: { 3679 MachineFunction &MF = DAG.getMachineFunction(); 3680 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3681 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3682 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3683 SDValue CPAddr; 3684 bool IsPositionIndependent = isPositionIndependent(); 3685 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3686 ARMConstantPoolValue *CPV = 3687 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3688 ARMCP::CPLSDA, PCAdj); 3689 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3690 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3691 SDValue Result = DAG.getLoad( 3692 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3693 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3694 3695 if (IsPositionIndependent) { 3696 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3697 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3698 } 3699 return Result; 3700 } 3701 case Intrinsic::arm_neon_vabs: 3702 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3703 Op.getOperand(1)); 3704 case Intrinsic::arm_neon_vmulls: 3705 case Intrinsic::arm_neon_vmullu: { 3706 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3707 ? 
ARMISD::VMULLs : ARMISD::VMULLu; 3708 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3709 Op.getOperand(1), Op.getOperand(2)); 3710 } 3711 case Intrinsic::arm_neon_vminnm: 3712 case Intrinsic::arm_neon_vmaxnm: { 3713 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3714 ? ISD::FMINNUM : ISD::FMAXNUM; 3715 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3716 Op.getOperand(1), Op.getOperand(2)); 3717 } 3718 case Intrinsic::arm_neon_vminu: 3719 case Intrinsic::arm_neon_vmaxu: { 3720 if (Op.getValueType().isFloatingPoint()) 3721 return SDValue(); 3722 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3723 ? ISD::UMIN : ISD::UMAX; 3724 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3725 Op.getOperand(1), Op.getOperand(2)); 3726 } 3727 case Intrinsic::arm_neon_vmins: 3728 case Intrinsic::arm_neon_vmaxs: { 3729 // v{min,max}s is overloaded between signed integers and floats. 3730 if (!Op.getValueType().isFloatingPoint()) { 3731 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3732 ? ISD::SMIN : ISD::SMAX; 3733 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3734 Op.getOperand(1), Op.getOperand(2)); 3735 } 3736 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3737 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3738 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3739 Op.getOperand(1), Op.getOperand(2)); 3740 } 3741 case Intrinsic::arm_neon_vtbl1: 3742 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3743 Op.getOperand(1), Op.getOperand(2)); 3744 case Intrinsic::arm_neon_vtbl2: 3745 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3746 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3747 case Intrinsic::arm_mve_pred_i2v: 3748 case Intrinsic::arm_mve_pred_v2i: 3749 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 3750 Op.getOperand(1)); 3751 } 3752 } 3753 3754 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3755 const ARMSubtarget *Subtarget) { 3756 SDLoc dl(Op); 3757 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3758 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3759 if (SSID == SyncScope::SingleThread) 3760 return Op; 3761 3762 if (!Subtarget->hasDataBarrier()) { 3763 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3764 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3765 // here. 3766 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3767 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3768 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3769 DAG.getConstant(0, dl, MVT::i32)); 3770 } 3771 3772 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3773 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3774 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3775 if (Subtarget->isMClass()) { 3776 // Only a full system barrier exists in the M-class architectures. 3777 Domain = ARM_MB::SY; 3778 } else if (Subtarget->preferISHSTBarriers() && 3779 Ord == AtomicOrdering::Release) { 3780 // Swift happens to implement ISHST barriers in a way that's compatible with 3781 // Release semantics but weaker than ISH so we'd be fools not to use 3782 // it. Beware: other processors probably don't! 
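    // i.e. a release fence on such cores is emitted as "dmb ishst" rather
    // than the default "dmb ish".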
3783 Domain = ARM_MB::ISHST; 3784 } 3785 3786 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3787 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3788 DAG.getConstant(Domain, dl, MVT::i32)); 3789 } 3790 3791 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3792 const ARMSubtarget *Subtarget) { 3793 // ARM pre v5TE and Thumb1 does not have preload instructions. 3794 if (!(Subtarget->isThumb2() || 3795 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3796 // Just preserve the chain. 3797 return Op.getOperand(0); 3798 3799 SDLoc dl(Op); 3800 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3801 if (!isRead && 3802 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3803 // ARMv7 with MP extension has PLDW. 3804 return Op.getOperand(0); 3805 3806 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3807 if (Subtarget->isThumb()) { 3808 // Invert the bits. 3809 isRead = ~isRead & 1; 3810 isData = ~isData & 1; 3811 } 3812 3813 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3814 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3815 DAG.getConstant(isData, dl, MVT::i32)); 3816 } 3817 3818 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3819 MachineFunction &MF = DAG.getMachineFunction(); 3820 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3821 3822 // vastart just stores the address of the VarArgsFrameIndex slot into the 3823 // memory location argument. 3824 SDLoc dl(Op); 3825 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3826 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3827 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3828 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3829 MachinePointerInfo(SV)); 3830 } 3831 3832 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3833 CCValAssign &NextVA, 3834 SDValue &Root, 3835 SelectionDAG &DAG, 3836 const SDLoc &dl) const { 3837 MachineFunction &MF = DAG.getMachineFunction(); 3838 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3839 3840 const TargetRegisterClass *RC; 3841 if (AFI->isThumb1OnlyFunction()) 3842 RC = &ARM::tGPRRegClass; 3843 else 3844 RC = &ARM::GPRRegClass; 3845 3846 // Transform the arguments stored in physical registers into virtual ones. 3847 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3848 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3849 3850 SDValue ArgValue2; 3851 if (NextVA.isMemLoc()) { 3852 MachineFrameInfo &MFI = MF.getFrameInfo(); 3853 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3854 3855 // Create load node to retrieve arguments from the stack. 3856 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3857 ArgValue2 = DAG.getLoad( 3858 MVT::i32, dl, Root, FIN, 3859 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3860 } else { 3861 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3862 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3863 } 3864 if (!Subtarget->isLittle()) 3865 std::swap (ArgValue, ArgValue2); 3866 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3867 } 3868 3869 // The remaining GPRs hold either the beginning of variable-argument 3870 // data, or the beginning of an aggregate passed by value (usually 3871 // byval). 
Either way, we allocate stack slots adjacent to the data 3872 // provided by our caller, and store the unallocated registers there. 3873 // If this is a variadic function, the va_list pointer will begin with 3874 // these values; otherwise, this reassembles a (byval) structure that 3875 // was split between registers and memory. 3876 // Return: the frame index the registers were stored into. 3877 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3878 const SDLoc &dl, SDValue &Chain, 3879 const Value *OrigArg, 3880 unsigned InRegsParamRecordIdx, 3881 int ArgOffset, unsigned ArgSize) const { 3882 // Currently, two use-cases are possible: 3883 // Case #1. Non-var-args function, and we meet the first byval parameter. 3884 // Set up the first unallocated register as the first byval register; 3885 // eat all remaining registers 3886 // (these two actions are performed by the HandleByVal method). 3887 // Then, here, we initialize the stack frame with 3888 // "store-reg" instructions. 3889 // Case #2. Var-args function that doesn't contain byval parameters. 3890 // The same: eat all remaining unallocated registers, 3891 // initialize the stack frame. 3892 3893 MachineFunction &MF = DAG.getMachineFunction(); 3894 MachineFrameInfo &MFI = MF.getFrameInfo(); 3895 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3896 unsigned RBegin, REnd; 3897 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3898 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3899 } else { 3900 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3901 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3902 REnd = ARM::R4; 3903 } 3904 3905 if (REnd != RBegin) 3906 ArgOffset = -4 * (ARM::R4 - RBegin); 3907 3908 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3909 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3910 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3911 3912 SmallVector<SDValue, 4> MemOps; 3913 const TargetRegisterClass *RC = 3914 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3915 3916 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3917 unsigned VReg = MF.addLiveIn(Reg, RC); 3918 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3919 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3920 MachinePointerInfo(OrigArg, 4 * i)); 3921 MemOps.push_back(Store); 3922 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3923 } 3924 3925 if (!MemOps.empty()) 3926 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3927 return FrameIndex; 3928 } 3929 3930 // Set up the stack frame the va_list pointer will start from. 3931 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3932 const SDLoc &dl, SDValue &Chain, 3933 unsigned ArgOffset, 3934 unsigned TotalArgRegsSaveSize, 3935 bool ForceMutable) const { 3936 MachineFunction &MF = DAG.getMachineFunction(); 3937 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3938 3939 // Try to store any remaining integer argument regs 3940 // to their spots on the stack so that they may be loaded by dereferencing 3941 // the result of va_next. 3942 // If there are no regs to be stored, just point the address past the last 3943 // argument passed via the stack.
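  // Note we always reserve at least 4 bytes here (the std::max below), so the
  // va_list frame object exists even when no argument registers remain to be
  // spilled.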
3944 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3945 CCInfo.getInRegsParamsCount(), 3946 CCInfo.getNextStackOffset(), 3947 std::max(4U, TotalArgRegsSaveSize)); 3948 AFI->setVarArgsFrameIndex(FrameIndex); 3949 } 3950 3951 SDValue ARMTargetLowering::LowerFormalArguments( 3952 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3953 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3954 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3955 MachineFunction &MF = DAG.getMachineFunction(); 3956 MachineFrameInfo &MFI = MF.getFrameInfo(); 3957 3958 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3959 3960 // Assign locations to all of the incoming arguments. 3961 SmallVector<CCValAssign, 16> ArgLocs; 3962 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3963 *DAG.getContext()); 3964 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 3965 3966 SmallVector<SDValue, 16> ArgValues; 3967 SDValue ArgValue; 3968 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3969 unsigned CurArgIdx = 0; 3970 3971 // Initially ArgRegsSaveSize is zero. 3972 // Then we increase this value each time we meet byval parameter. 3973 // We also increase this value in case of varargs function. 3974 AFI->setArgRegsSaveSize(0); 3975 3976 // Calculate the amount of stack space that we need to allocate to store 3977 // byval and variadic arguments that are passed in registers. 3978 // We need to know this before we allocate the first byval or variadic 3979 // argument, as they will be allocated a stack slot below the CFA (Canonical 3980 // Frame Address, the stack pointer at entry to the function). 3981 unsigned ArgRegBegin = ARM::R4; 3982 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3983 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3984 break; 3985 3986 CCValAssign &VA = ArgLocs[i]; 3987 unsigned Index = VA.getValNo(); 3988 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3989 if (!Flags.isByVal()) 3990 continue; 3991 3992 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3993 unsigned RBegin, REnd; 3994 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3995 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3996 3997 CCInfo.nextInRegsParam(); 3998 } 3999 CCInfo.rewindByValRegsInfo(); 4000 4001 int lastInsIndex = -1; 4002 if (isVarArg && MFI.hasVAStart()) { 4003 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4004 if (RegIdx != array_lengthof(GPRArgRegs)) 4005 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4006 } 4007 4008 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4009 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4010 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4011 4012 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4013 CCValAssign &VA = ArgLocs[i]; 4014 if (Ins[VA.getValNo()].isOrigArg()) { 4015 std::advance(CurOrigArg, 4016 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4017 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4018 } 4019 // Arguments stored in registers. 4020 if (VA.isRegLoc()) { 4021 EVT RegVT = VA.getLocVT(); 4022 4023 if (VA.needsCustom()) { 4024 // f64 and vector types are split up into multiple registers or 4025 // combinations of registers and stack slots. 
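        // For example, an f64 passed in GPRs arrives as a register pair (or a
        // register plus a stack slot) and is reassembled with ARMISD::VMOVDRR;
        // a v2f64 is rebuilt from two such f64 halves below.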
4026 if (VA.getLocVT() == MVT::v2f64) { 4027 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 4028 Chain, DAG, dl); 4029 VA = ArgLocs[++i]; // skip ahead to next loc 4030 SDValue ArgValue2; 4031 if (VA.isMemLoc()) { 4032 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4033 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4034 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 4035 MachinePointerInfo::getFixedStack( 4036 DAG.getMachineFunction(), FI)); 4037 } else { 4038 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 4039 Chain, DAG, dl); 4040 } 4041 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4042 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4043 ArgValue, ArgValue1, 4044 DAG.getIntPtrConstant(0, dl)); 4045 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4046 ArgValue, ArgValue2, 4047 DAG.getIntPtrConstant(1, dl)); 4048 } else 4049 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4050 } else { 4051 const TargetRegisterClass *RC; 4052 4053 4054 if (RegVT == MVT::f16) 4055 RC = &ARM::HPRRegClass; 4056 else if (RegVT == MVT::f32) 4057 RC = &ARM::SPRRegClass; 4058 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 4059 RC = &ARM::DPRRegClass; 4060 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 4061 RC = &ARM::QPRRegClass; 4062 else if (RegVT == MVT::i32) 4063 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4064 : &ARM::GPRRegClass; 4065 else 4066 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4067 4068 // Transform the arguments in physical registers into virtual ones. 4069 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4070 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4071 4072 // If this value is passed in r0 and has the returned attribute (e.g. 4073 // C++ 'structors), record this fact for later use. 4074 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4075 AFI->setPreservesR0(); 4076 } 4077 } 4078 4079 // If this is an 8 or 16-bit value, it is really passed promoted 4080 // to 32 bits. Insert an assert[sz]ext to capture this, then 4081 // truncate to the right size. 4082 switch (VA.getLocInfo()) { 4083 default: llvm_unreachable("Unknown loc info!"); 4084 case CCValAssign::Full: break; 4085 case CCValAssign::BCvt: 4086 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4087 break; 4088 case CCValAssign::SExt: 4089 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4090 DAG.getValueType(VA.getValVT())); 4091 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4092 break; 4093 case CCValAssign::ZExt: 4094 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4095 DAG.getValueType(VA.getValVT())); 4096 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4097 break; 4098 } 4099 4100 InVals.push_back(ArgValue); 4101 } else { // VA.isRegLoc() 4102 // sanity check 4103 assert(VA.isMemLoc()); 4104 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4105 4106 int index = VA.getValNo(); 4107 4108 // Some Ins[] entries become multiple ArgLoc[] entries. 4109 // Process them only once. 4110 if (index != lastInsIndex) 4111 { 4112 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4113 // FIXME: For now, all byval parameter objects are marked mutable. 4114 // This can be changed with more analysis. 4115 // In case of tail call optimization mark all arguments mutable. 4116 // Since they could be overwritten by lowering of arguments in case of 4117 // a tail call. 
4118 if (Flags.isByVal()) { 4119 assert(Ins[index].isOrigArg() && 4120 "Byval arguments cannot be implicit"); 4121 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4122 4123 int FrameIndex = StoreByValRegs( 4124 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4125 VA.getLocMemOffset(), Flags.getByValSize()); 4126 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4127 CCInfo.nextInRegsParam(); 4128 } else { 4129 unsigned FIOffset = VA.getLocMemOffset(); 4130 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4131 FIOffset, true); 4132 4133 // Create load nodes to retrieve arguments from the stack. 4134 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4135 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4136 MachinePointerInfo::getFixedStack( 4137 DAG.getMachineFunction(), FI))); 4138 } 4139 lastInsIndex = index; 4140 } 4141 } 4142 } 4143 4144 // varargs 4145 if (isVarArg && MFI.hasVAStart()) 4146 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 4147 CCInfo.getNextStackOffset(), 4148 TotalArgRegsSaveSize); 4149 4150 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 4151 4152 return Chain; 4153 } 4154 4155 /// isFloatingPointZero - Return true if this is +0.0. 4156 static bool isFloatingPointZero(SDValue Op) { 4157 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4158 return CFP->getValueAPF().isPosZero(); 4159 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4160 // Maybe this has already been legalized into the constant pool? 4161 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4162 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4163 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4164 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4165 return CFP->getValueAPF().isPosZero(); 4166 } 4167 } else if (Op->getOpcode() == ISD::BITCAST && 4168 Op->getValueType(0) == MVT::f64) { 4169 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4170 // created by LowerConstantFP(). 4171 SDValue BitcastOp = Op->getOperand(0); 4172 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4173 isNullConstant(BitcastOp->getOperand(0))) 4174 return true; 4175 } 4176 return false; 4177 } 4178 4179 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4180 /// the given operands. 4181 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4182 SDValue &ARMcc, SelectionDAG &DAG, 4183 const SDLoc &dl) const { 4184 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4185 unsigned C = RHSC->getZExtValue(); 4186 if (!isLegalICmpImmediate((int32_t)C)) { 4187 // Constant does not fit, try adjusting it by one. 4188 switch (CC) { 4189 default: break; 4190 case ISD::SETLT: 4191 case ISD::SETGE: 4192 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4193 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4194 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4195 } 4196 break; 4197 case ISD::SETULT: 4198 case ISD::SETUGE: 4199 if (C != 0 && isLegalICmpImmediate(C-1)) { 4200 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 4201 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4202 } 4203 break; 4204 case ISD::SETLE: 4205 case ISD::SETGT: 4206 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4207 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 4208 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4209 } 4210 break; 4211 case ISD::SETULE: 4212 case ISD::SETUGT: 4213 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4214 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4215 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4216 } 4217 break; 4218 } 4219 } 4220 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4221 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4222 // In ARM and Thumb-2, the compare instructions can shift their second 4223 // operand. 4224 CC = ISD::getSetCCSwappedOperands(CC); 4225 std::swap(LHS, RHS); 4226 } 4227 4228 // Thumb1 has very limited immediate modes, so turning an "and" into a 4229 // shift can save multiple instructions. 4230 // 4231 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4232 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4233 // own. If it's the operand to an unsigned comparison with an immediate, 4234 // we can eliminate one of the shifts: we transform 4235 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4236 // 4237 // We avoid transforming cases which aren't profitable due to encoding 4238 // details: 4239 // 4240 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4241 // would not; in that case, we're essentially trading one immediate load for 4242 // another. 4243 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4244 // 3. C2 is zero; we have other code for this special case. 4245 // 4246 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4247 // instruction, since the AND is always one instruction anyway, but we could 4248 // use narrow instructions in some cases. 4249 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4250 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4251 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4252 !isSignedIntSetCC(CC)) { 4253 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4254 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4255 uint64_t RHSV = RHSC->getZExtValue(); 4256 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4257 unsigned ShiftBits = countLeadingZeros(Mask); 4258 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4259 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4260 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4261 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4262 } 4263 } 4264 } 4265 4266 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4267 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4268 // way a cmp would. 4269 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4270 // some tweaks to the heuristics for the previous and->shift transform. 4271 // FIXME: Optimize cases where the LHS isn't a shift. 
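  // Illustrative example (sketch only): for "(x << 3) > 0x80000000u" this
  // emits a single "lsls rX, rX, #4" and branches on HI, reusing the C and Z
  // flags set by the shift instead of materialising the large immediate for a
  // cmp.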
4272 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4273 isa<ConstantSDNode>(RHS) && 4274 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4275 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4276 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4277 unsigned ShiftAmt = 4278 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4279 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4280 DAG.getVTList(MVT::i32, MVT::i32), 4281 LHS.getOperand(0), 4282 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4283 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4284 Shift.getValue(1), SDValue()); 4285 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4286 return Chain.getValue(1); 4287 } 4288 4289 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4290 4291 // If the RHS is a constant zero then the V (overflow) flag will never be 4292 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4293 // simpler for other passes (like the peephole optimiser) to deal with. 4294 if (isNullConstant(RHS)) { 4295 switch (CondCode) { 4296 default: break; 4297 case ARMCC::GE: 4298 CondCode = ARMCC::PL; 4299 break; 4300 case ARMCC::LT: 4301 CondCode = ARMCC::MI; 4302 break; 4303 } 4304 } 4305 4306 ARMISD::NodeType CompareType; 4307 switch (CondCode) { 4308 default: 4309 CompareType = ARMISD::CMP; 4310 break; 4311 case ARMCC::EQ: 4312 case ARMCC::NE: 4313 // Uses only Z Flag 4314 CompareType = ARMISD::CMPZ; 4315 break; 4316 } 4317 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4318 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4319 } 4320 4321 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 4322 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4323 SelectionDAG &DAG, const SDLoc &dl) const { 4324 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4325 SDValue Cmp; 4326 if (!isFloatingPointZero(RHS)) 4327 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); 4328 else 4329 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); 4330 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4331 } 4332 4333 /// duplicateCmp - Glue values can have only one use, so this function 4334 /// duplicates a comparison node. 4335 SDValue 4336 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4337 unsigned Opc = Cmp.getOpcode(); 4338 SDLoc DL(Cmp); 4339 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4340 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4341 4342 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4343 Cmp = Cmp.getOperand(0); 4344 Opc = Cmp.getOpcode(); 4345 if (Opc == ARMISD::CMPFP) 4346 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4347 else { 4348 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4349 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4350 } 4351 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4352 } 4353 4354 // This function returns three things: the arithmetic computation itself 4355 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4356 // comparison and the condition code define the case in which the arithmetic 4357 // computation *does not* overflow. 
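// For example (illustrative only; the final instructions depend on later
// selection): i32 ISD::SADDO yields Value = (add LHS, RHS) together with
// OverflowCmp = (cmp Value, LHS) and ARMcc = VC, i.e. the compare/condition
// pair holds precisely when the addition did not overflow.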
4358 std::pair<SDValue, SDValue> 4359 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4360 SDValue &ARMcc) const { 4361 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4362 4363 SDValue Value, OverflowCmp; 4364 SDValue LHS = Op.getOperand(0); 4365 SDValue RHS = Op.getOperand(1); 4366 SDLoc dl(Op); 4367 4368 // FIXME: We are currently always generating CMPs because we don't support 4369 // generating CMN through the backend. This is not as good as the natural 4370 // CMP case because it causes a register dependency and cannot be folded 4371 // later. 4372 4373 switch (Op.getOpcode()) { 4374 default: 4375 llvm_unreachable("Unknown overflow instruction!"); 4376 case ISD::SADDO: 4377 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4378 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4379 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4380 break; 4381 case ISD::UADDO: 4382 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4383 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4384 // We do not use it in the USUBO case as Value may not be used. 4385 Value = DAG.getNode(ARMISD::ADDC, dl, 4386 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4387 .getValue(0); 4388 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4389 break; 4390 case ISD::SSUBO: 4391 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4392 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4393 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4394 break; 4395 case ISD::USUBO: 4396 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4397 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4398 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4399 break; 4400 case ISD::UMULO: 4401 // We generate a UMUL_LOHI and then check if the high word is 0. 4402 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4403 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4404 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4405 LHS, RHS); 4406 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4407 DAG.getConstant(0, dl, MVT::i32)); 4408 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4409 break; 4410 case ISD::SMULO: 4411 // We generate a SMUL_LOHI and then check if all the bits of the high word 4412 // are the same as the sign bit of the low word. 4413 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4414 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4415 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4416 LHS, RHS); 4417 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4418 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4419 Value.getValue(0), 4420 DAG.getConstant(31, dl, MVT::i32))); 4421 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4422 break; 4423 } // switch (...) 4424 4425 return std::make_pair(Value, OverflowCmp); 4426 } 4427 4428 SDValue 4429 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4430 // Let legalize expand this if it isn't a legal type yet. 4431 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4432 return SDValue(); 4433 4434 SDValue Value, OverflowCmp; 4435 SDValue ARMcc; 4436 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4437 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4438 SDLoc dl(Op); 4439 // We use 0 and 1 as false and true values. 
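  // ARMcc encodes the *no overflow* case (see getARMXALUOOp), so the CMOV
  // below yields 1 exactly when the operation overflowed and 0 otherwise.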
4440 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4441 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4442 EVT VT = Op.getValueType(); 4443 4444 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4445 ARMcc, CCR, OverflowCmp); 4446 4447 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4448 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4449 } 4450 4451 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4452 SelectionDAG &DAG) { 4453 SDLoc DL(BoolCarry); 4454 EVT CarryVT = BoolCarry.getValueType(); 4455 4456 // This converts the boolean value carry into the carry flag by doing 4457 // ARMISD::SUBC Carry, 1 4458 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4459 DAG.getVTList(CarryVT, MVT::i32), 4460 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4461 return Carry.getValue(1); 4462 } 4463 4464 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4465 SelectionDAG &DAG) { 4466 SDLoc DL(Flags); 4467 4468 // Now convert the carry flag into a boolean carry. We do this 4469 // using ARMISD:ADDE 0, 0, Carry 4470 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4471 DAG.getConstant(0, DL, MVT::i32), 4472 DAG.getConstant(0, DL, MVT::i32), Flags); 4473 } 4474 4475 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4476 SelectionDAG &DAG) const { 4477 // Let legalize expand this if it isn't a legal type yet. 4478 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4479 return SDValue(); 4480 4481 SDValue LHS = Op.getOperand(0); 4482 SDValue RHS = Op.getOperand(1); 4483 SDLoc dl(Op); 4484 4485 EVT VT = Op.getValueType(); 4486 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4487 SDValue Value; 4488 SDValue Overflow; 4489 switch (Op.getOpcode()) { 4490 default: 4491 llvm_unreachable("Unknown overflow instruction!"); 4492 case ISD::UADDO: 4493 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4494 // Convert the carry flag into a boolean value. 4495 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4496 break; 4497 case ISD::USUBO: { 4498 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4499 // Convert the carry flag into a boolean value. 4500 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4501 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4502 // value. So compute 1 - C. 4503 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4504 DAG.getConstant(1, dl, MVT::i32), Overflow); 4505 break; 4506 } 4507 } 4508 4509 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4510 } 4511 4512 static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, 4513 const ARMSubtarget *Subtarget) { 4514 EVT VT = Op.getValueType(); 4515 if (!Subtarget->hasDSP()) 4516 return SDValue(); 4517 if (!VT.isSimple()) 4518 return SDValue(); 4519 4520 unsigned NewOpcode; 4521 bool IsAdd = Op->getOpcode() == ISD::SADDSAT; 4522 switch (VT.getSimpleVT().SimpleTy) { 4523 default: 4524 return SDValue(); 4525 case MVT::i8: 4526 NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; 4527 break; 4528 case MVT::i16: 4529 NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b; 4530 break; 4531 } 4532 4533 SDLoc dl(Op); 4534 SDValue Add = 4535 DAG.getNode(NewOpcode, dl, MVT::i32, 4536 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 4537 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 4538 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 4539 } 4540 4541 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4542 SDValue Cond = Op.getOperand(0); 4543 SDValue SelectTrue = Op.getOperand(1); 4544 SDValue SelectFalse = Op.getOperand(2); 4545 SDLoc dl(Op); 4546 unsigned Opc = Cond.getOpcode(); 4547 4548 if (Cond.getResNo() == 1 && 4549 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4550 Opc == ISD::USUBO)) { 4551 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4552 return SDValue(); 4553 4554 SDValue Value, OverflowCmp; 4555 SDValue ARMcc; 4556 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4557 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4558 EVT VT = Op.getValueType(); 4559 4560 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4561 OverflowCmp, DAG); 4562 } 4563 4564 // Convert: 4565 // 4566 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4567 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4568 // 4569 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4570 const ConstantSDNode *CMOVTrue = 4571 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4572 const ConstantSDNode *CMOVFalse = 4573 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4574 4575 if (CMOVTrue && CMOVFalse) { 4576 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4577 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4578 4579 SDValue True; 4580 SDValue False; 4581 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4582 True = SelectTrue; 4583 False = SelectFalse; 4584 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4585 True = SelectFalse; 4586 False = SelectTrue; 4587 } 4588 4589 if (True.getNode() && False.getNode()) { 4590 EVT VT = Op.getValueType(); 4591 SDValue ARMcc = Cond.getOperand(2); 4592 SDValue CCR = Cond.getOperand(3); 4593 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4594 assert(True.getValueType() == VT); 4595 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4596 } 4597 } 4598 } 4599 4600 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4601 // undefined bits before doing a full-word comparison with zero. 4602 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4603 DAG.getConstant(1, dl, Cond.getValueType())); 4604 4605 return DAG.getSelectCC(dl, Cond, 4606 DAG.getConstant(0, dl, Cond.getValueType()), 4607 SelectTrue, SelectFalse, ISD::SETNE); 4608 } 4609 4610 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4611 bool &swpCmpOps, bool &swpVselOps) { 4612 // Start by selecting the GE condition code for opcodes that return true for 4613 // 'equality' 4614 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4615 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4616 CondCode = ARMCC::GE; 4617 4618 // and GT for opcodes that return false for 'equality'. 4619 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4620 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4621 CondCode = ARMCC::GT; 4622 4623 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4624 // to swap the compare operands. 
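  // For example (illustrative): SETOLT was mapped to GT above and will have
  // its compare operands swapped below, since "a < b" is "b > a" for ordered
  // compares.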
4625 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4626 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4627 swpCmpOps = true; 4628 4629 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4630 // If we have an unordered opcode, we need to swap the operands to the VSEL 4631 // instruction (effectively negating the condition). 4632 // 4633 // This also has the effect of swapping which one of 'less' or 'greater' 4634 // returns true, so we also swap the compare operands. It also switches 4635 // whether we return true for 'equality', so we compensate by picking the 4636 // opposite condition code to our original choice. 4637 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4638 CC == ISD::SETUGT) { 4639 swpCmpOps = !swpCmpOps; 4640 swpVselOps = !swpVselOps; 4641 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4642 } 4643 4644 // 'ordered' is 'anything but unordered', so use the VS condition code and 4645 // swap the VSEL operands. 4646 if (CC == ISD::SETO) { 4647 CondCode = ARMCC::VS; 4648 swpVselOps = true; 4649 } 4650 4651 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4652 // code and swap the VSEL operands. Also do this if we don't care about the 4653 // unordered case. 4654 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4655 CondCode = ARMCC::EQ; 4656 swpVselOps = true; 4657 } 4658 } 4659 4660 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4661 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4662 SDValue Cmp, SelectionDAG &DAG) const { 4663 if (!Subtarget->hasFP64() && VT == MVT::f64) { 4664 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4665 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4666 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4667 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4668 4669 SDValue TrueLow = TrueVal.getValue(0); 4670 SDValue TrueHigh = TrueVal.getValue(1); 4671 SDValue FalseLow = FalseVal.getValue(0); 4672 SDValue FalseHigh = FalseVal.getValue(1); 4673 4674 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4675 ARMcc, CCR, Cmp); 4676 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4677 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4678 4679 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4680 } else { 4681 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4682 Cmp); 4683 } 4684 } 4685 4686 static bool isGTorGE(ISD::CondCode CC) { 4687 return CC == ISD::SETGT || CC == ISD::SETGE; 4688 } 4689 4690 static bool isLTorLE(ISD::CondCode CC) { 4691 return CC == ISD::SETLT || CC == ISD::SETLE; 4692 } 4693 4694 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4695 // All of these conditions (and their <= and >= counterparts) will do: 4696 // x < k ? k : x 4697 // x > k ? x : k 4698 // k < x ? x : k 4699 // k > x ? k : x 4700 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4701 const SDValue TrueVal, const SDValue FalseVal, 4702 const ISD::CondCode CC, const SDValue K) { 4703 return (isGTorGE(CC) && 4704 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4705 (isLTorLE(CC) && 4706 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4707 } 4708 4709 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
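// For example (illustrative): "x > k ? k : x" and "k < x ? k : x" both clamp
// x from above at k, so they match the patterns checked below.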
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to
// an interval of the form [k, ~k] when k + 1 is a power of 2. Here are some
// examples:
//
// x < -k ? -k : (x > k ? k : x)
// x < -k ? -k : (x < k ? x : k)
// x > -k ? (x > k ? k : x) : -k
// x < k ? (x < -k ? -k : x) : k
// etc.
//
// USAT works similarly to SSAT, but bounds the value to the interval [0, k],
// where k + 1 is a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K,
// and usat is set to true if the conditional represents an unsigned
// saturation.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                    uint64_t &K, bool &usat) {
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional.
  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                        ? &RHS1
                                                        : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                        ? &RHS2
                                                        : nullptr;
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such cases, V2Tmp != V2 because the comparison
  // operations must work with sign-extended values but the select operations
  // return the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals.
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ?
&Op 4791 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4792 ? &Op2 4793 : nullptr; 4794 4795 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4796 return false; 4797 4798 // Check that the constant in the lower-bound check is 4799 // the opposite of the constant in the upper-bound check 4800 // in 1's complement. 4801 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4802 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4803 int64_t PosVal = std::max(Val1, Val2); 4804 int64_t NegVal = std::min(Val1, Val2); 4805 4806 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4807 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4808 isPowerOf2_64(PosVal + 1)) { 4809 4810 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4811 if (Val1 == ~Val2) 4812 usat = false; 4813 else if (NegVal == 0) 4814 usat = true; 4815 else 4816 return false; 4817 4818 V = V2; 4819 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4820 4821 return true; 4822 } 4823 4824 return false; 4825 } 4826 4827 // Check if a condition of the type x < k ? k : x can be converted into a 4828 // bit operation instead of conditional moves. 4829 // Currently this is allowed given: 4830 // - The conditions and values match up 4831 // - k is 0 or -1 (all ones) 4832 // This function will not check the last condition, thats up to the caller 4833 // It returns true if the transformation can be made, and in such case 4834 // returns x in V, and k in SatK. 4835 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 4836 SDValue &SatK) 4837 { 4838 SDValue LHS = Op.getOperand(0); 4839 SDValue RHS = Op.getOperand(1); 4840 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4841 SDValue TrueVal = Op.getOperand(2); 4842 SDValue FalseVal = Op.getOperand(3); 4843 4844 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 4845 ? &RHS 4846 : nullptr; 4847 4848 // No constant operation in comparison, early out 4849 if (!K) 4850 return false; 4851 4852 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 4853 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 4854 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 4855 4856 // If the constant on left and right side, or variable on left and right, 4857 // does not match, early out 4858 if (*K != KTmp || V != VTmp) 4859 return false; 4860 4861 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 4862 SatK = *K; 4863 return true; 4864 } 4865 4866 return false; 4867 } 4868 4869 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 4870 if (VT == MVT::f32) 4871 return !Subtarget->hasVFP2Base(); 4872 if (VT == MVT::f64) 4873 return !Subtarget->hasFP64(); 4874 if (VT == MVT::f16) 4875 return !Subtarget->hasFullFP16(); 4876 return false; 4877 } 4878 4879 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4880 EVT VT = Op.getValueType(); 4881 SDLoc dl(Op); 4882 4883 // Try to convert two saturating conditional selects into a single SSAT 4884 SDValue SatValue; 4885 uint64_t SatConstant; 4886 bool SatUSat; 4887 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4888 isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { 4889 if (SatUSat) 4890 return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, 4891 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4892 else 4893 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4894 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4895 } 4896 4897 // Try to convert expressions of the form x < k ? k : x (and similar forms) 4898 // into more efficient bit operations, which is possible when k is 0 or -1 4899 // On ARM and Thumb-2 which have flexible operand 2 this will result in 4900 // single instructions. On Thumb the shift and the bit operation will be two 4901 // instructions. 4902 // Only allow this transformation on full-width (32-bit) operations 4903 SDValue LowerSatConstant; 4904 if (VT == MVT::i32 && 4905 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 4906 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 4907 DAG.getConstant(31, dl, VT)); 4908 if (isNullConstant(LowerSatConstant)) { 4909 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 4910 DAG.getAllOnesConstant(dl, VT)); 4911 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 4912 } else if (isAllOnesConstant(LowerSatConstant)) 4913 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 4914 } 4915 4916 SDValue LHS = Op.getOperand(0); 4917 SDValue RHS = Op.getOperand(1); 4918 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4919 SDValue TrueVal = Op.getOperand(2); 4920 SDValue FalseVal = Op.getOperand(3); 4921 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 4922 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 4923 4924 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 4925 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 4926 unsigned TVal = CTVal->getZExtValue(); 4927 unsigned FVal = CFVal->getZExtValue(); 4928 unsigned Opcode = 0; 4929 4930 if (TVal == ~FVal) { 4931 Opcode = ARMISD::CSINV; 4932 } else if (TVal == ~FVal + 1) { 4933 Opcode = ARMISD::CSNEG; 4934 } else if (TVal + 1 == FVal) { 4935 Opcode = ARMISD::CSINC; 4936 } else if (TVal == FVal + 1) { 4937 Opcode = ARMISD::CSINC; 4938 std::swap(TrueVal, FalseVal); 4939 std::swap(TVal, FVal); 4940 CC = ISD::getSetCCInverse(CC, true); 4941 } 4942 4943 if (Opcode) { 4944 // If one of the constants is cheaper than another, materialise the 4945 // cheaper one and let the csel generate the other. 
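      // For example (sketch, assuming typical immediate costs): if FVal is a
      // simple modified immediate while TVal (== ~FVal) would need a
      // movw+movt pair, the swap below makes FVal the materialised value and
      // lets CSINV reconstruct the expensive constant.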
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      // Attempt to use ZR, checking whether TVal is 0 and possibly inverting
      // the condition to get there. CSINC is not invertible like the other
      // two (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
      if (TVal == 0)
        TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);

      // Drop FVal's value because we can recover it by inverting/negating
      // TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition
    // didn't).
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support.
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
5024 if (Subtarget->hasFPARMv8Base() && 5025 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5026 (TrueVal.getValueType() == MVT::f16 || 5027 TrueVal.getValueType() == MVT::f32 || 5028 TrueVal.getValueType() == MVT::f64)) { 5029 bool swpCmpOps = false; 5030 bool swpVselOps = false; 5031 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5032 5033 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5034 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5035 if (swpCmpOps) 5036 std::swap(LHS, RHS); 5037 if (swpVselOps) 5038 std::swap(TrueVal, FalseVal); 5039 } 5040 } 5041 5042 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5043 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5044 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5045 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5046 if (CondCode2 != ARMCC::AL) { 5047 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5048 // FIXME: Needs another CMP because flag can have but one use. 5049 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5050 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5051 } 5052 return Result; 5053 } 5054 5055 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5056 /// to morph to an integer compare sequence. 5057 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5058 const ARMSubtarget *Subtarget) { 5059 SDNode *N = Op.getNode(); 5060 if (!N->hasOneUse()) 5061 // Otherwise it requires moving the value from fp to integer registers. 5062 return false; 5063 if (!N->getNumValues()) 5064 return false; 5065 EVT VT = Op.getValueType(); 5066 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5067 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5068 // vmrs are very slow, e.g. cortex-a8. 5069 return false; 5070 5071 if (isFloatingPointZero(Op)) { 5072 SeenZero = true; 5073 return true; 5074 } 5075 return ISD::isNormalLoad(N); 5076 } 5077 5078 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5079 if (isFloatingPointZero(Op)) 5080 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5081 5082 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5083 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5084 Ld->getPointerInfo(), Ld->getAlignment(), 5085 Ld->getMemOperand()->getFlags()); 5086 5087 llvm_unreachable("Unknown VFP cmp argument!"); 5088 } 5089 5090 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5091 SDValue &RetVal1, SDValue &RetVal2) { 5092 SDLoc dl(Op); 5093 5094 if (isFloatingPointZero(Op)) { 5095 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5096 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5097 return; 5098 } 5099 5100 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5101 SDValue Ptr = Ld->getBasePtr(); 5102 RetVal1 = 5103 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5104 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5105 5106 EVT PtrType = Ptr.getValueType(); 5107 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5108 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5109 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5110 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5111 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5112 Ld->getMemOperand()->getFlags()); 5113 return; 5114 } 5115 5116 llvm_unreachable("Unknown VFP cmp argument!"); 5117 } 5118 5119 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5120 /// f32 and even f64 comparisons to integer ones. 
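/// Illustrative sketch: "x == 0.0f" can instead be checked as
/// "(bitcast<i32>(x) & 0x7fffffff) == 0", which treats +0.0 and -0.0 alike
/// and avoids the VFP compare plus FMSTAT; the f64 path applies the same mask
/// to the high half of the (lo, hi) register pair.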
5121 SDValue 5122 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5123 SDValue Chain = Op.getOperand(0); 5124 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5125 SDValue LHS = Op.getOperand(2); 5126 SDValue RHS = Op.getOperand(3); 5127 SDValue Dest = Op.getOperand(4); 5128 SDLoc dl(Op); 5129 5130 bool LHSSeenZero = false; 5131 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5132 bool RHSSeenZero = false; 5133 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5134 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5135 // If unsafe fp math optimization is enabled and there are no other uses of 5136 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5137 // to an integer comparison. 5138 if (CC == ISD::SETOEQ) 5139 CC = ISD::SETEQ; 5140 else if (CC == ISD::SETUNE) 5141 CC = ISD::SETNE; 5142 5143 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5144 SDValue ARMcc; 5145 if (LHS.getValueType() == MVT::f32) { 5146 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5147 bitcastf32Toi32(LHS, DAG), Mask); 5148 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5149 bitcastf32Toi32(RHS, DAG), Mask); 5150 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5151 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5152 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5153 Chain, Dest, ARMcc, CCR, Cmp); 5154 } 5155 5156 SDValue LHS1, LHS2; 5157 SDValue RHS1, RHS2; 5158 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5159 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5160 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5161 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5162 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5163 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5164 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5165 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5166 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5167 } 5168 5169 return SDValue(); 5170 } 5171 5172 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5173 SDValue Chain = Op.getOperand(0); 5174 SDValue Cond = Op.getOperand(1); 5175 SDValue Dest = Op.getOperand(2); 5176 SDLoc dl(Op); 5177 5178 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5179 // instruction. 5180 unsigned Opc = Cond.getOpcode(); 5181 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5182 !Subtarget->isThumb1Only(); 5183 if (Cond.getResNo() == 1 && 5184 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5185 Opc == ISD::USUBO || OptimizeMul)) { 5186 // Only lower legal XALUO ops. 5187 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5188 return SDValue(); 5189 5190 // The actual operation with overflow check. 5191 SDValue Value, OverflowCmp; 5192 SDValue ARMcc; 5193 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5194 5195 // Reverse the condition code. 
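    // getARMXALUOOp returns the condition under which the arithmetic does
    // *not* overflow, while this branch must be taken when it does.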
5196 ARMCC::CondCodes CondCode = 5197 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5198 CondCode = ARMCC::getOppositeCondition(CondCode); 5199 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5200 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5201 5202 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5203 OverflowCmp); 5204 } 5205 5206 return SDValue(); 5207 } 5208 5209 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5210 SDValue Chain = Op.getOperand(0); 5211 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5212 SDValue LHS = Op.getOperand(2); 5213 SDValue RHS = Op.getOperand(3); 5214 SDValue Dest = Op.getOperand(4); 5215 SDLoc dl(Op); 5216 5217 if (isUnsupportedFloatingType(LHS.getValueType())) { 5218 DAG.getTargetLoweringInfo().softenSetCCOperands( 5219 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5220 5221 // If softenSetCCOperands only returned one value, we should compare it to 5222 // zero. 5223 if (!RHS.getNode()) { 5224 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5225 CC = ISD::SETNE; 5226 } 5227 } 5228 5229 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5230 // instruction. 5231 unsigned Opc = LHS.getOpcode(); 5232 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5233 !Subtarget->isThumb1Only(); 5234 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5235 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5236 Opc == ISD::USUBO || OptimizeMul) && 5237 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5238 // Only lower legal XALUO ops. 5239 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5240 return SDValue(); 5241 5242 // The actual operation with overflow check. 5243 SDValue Value, OverflowCmp; 5244 SDValue ARMcc; 5245 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5246 5247 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5248 // Reverse the condition code. 
5249 ARMCC::CondCodes CondCode = 5250 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5251 CondCode = ARMCC::getOppositeCondition(CondCode); 5252 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5253 } 5254 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5255 5256 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5257 OverflowCmp); 5258 } 5259 5260 if (LHS.getValueType() == MVT::i32) { 5261 SDValue ARMcc; 5262 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5263 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5264 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5265 Chain, Dest, ARMcc, CCR, Cmp); 5266 } 5267 5268 if (getTargetMachine().Options.UnsafeFPMath && 5269 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5270 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5271 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5272 return Result; 5273 } 5274 5275 ARMCC::CondCodes CondCode, CondCode2; 5276 FPCCToARMCC(CC, CondCode, CondCode2); 5277 5278 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5279 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5280 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5281 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5282 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5283 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5284 if (CondCode2 != ARMCC::AL) { 5285 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5286 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5287 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5288 } 5289 return Res; 5290 } 5291 5292 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5293 SDValue Chain = Op.getOperand(0); 5294 SDValue Table = Op.getOperand(1); 5295 SDValue Index = Op.getOperand(2); 5296 SDLoc dl(Op); 5297 5298 EVT PTy = getPointerTy(DAG.getDataLayout()); 5299 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5300 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 5301 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5302 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5303 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5304 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5305 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5306 // which does another jump to the destination. This also makes it easier 5307 // to translate it to TBB / TBH later (Thumb2 only). 5308 // FIXME: This might not work if the function is extremely large. 
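    // Roughly (illustrative sketch): the branch lands on an entry of the jump
    // table itself, and each entry is in turn a branch to the real
    // destination block; later passes may compress this into TBB/TBH on
    // Thumb2.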
5309 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5310 Addr, Op.getOperand(2), JTI); 5311 } 5312 if (isPositionIndependent() || Subtarget->isROPI()) { 5313 Addr = 5314 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5315 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5316 Chain = Addr.getValue(1); 5317 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5318 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5319 } else { 5320 Addr = 5321 DAG.getLoad(PTy, dl, Chain, Addr, 5322 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5323 Chain = Addr.getValue(1); 5324 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5325 } 5326 } 5327 5328 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5329 EVT VT = Op.getValueType(); 5330 SDLoc dl(Op); 5331 5332 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5333 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5334 return Op; 5335 return DAG.UnrollVectorOp(Op.getNode()); 5336 } 5337 5338 const bool HasFullFP16 = 5339 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5340 5341 EVT NewTy; 5342 const EVT OpTy = Op.getOperand(0).getValueType(); 5343 if (OpTy == MVT::v4f32) 5344 NewTy = MVT::v4i32; 5345 else if (OpTy == MVT::v4f16 && HasFullFP16) 5346 NewTy = MVT::v4i16; 5347 else if (OpTy == MVT::v8f16 && HasFullFP16) 5348 NewTy = MVT::v8i16; 5349 else 5350 llvm_unreachable("Invalid type for custom lowering!"); 5351 5352 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5353 return DAG.UnrollVectorOp(Op.getNode()); 5354 5355 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5356 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5357 } 5358 5359 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5360 EVT VT = Op.getValueType(); 5361 if (VT.isVector()) 5362 return LowerVectorFP_TO_INT(Op, DAG); 5363 if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) { 5364 RTLIB::Libcall LC; 5365 if (Op.getOpcode() == ISD::FP_TO_SINT) 5366 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 5367 Op.getValueType()); 5368 else 5369 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 5370 Op.getValueType()); 5371 MakeLibCallOptions CallOptions; 5372 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5373 CallOptions, SDLoc(Op)).first; 5374 } 5375 5376 return Op; 5377 } 5378 5379 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5380 EVT VT = Op.getValueType(); 5381 SDLoc dl(Op); 5382 5383 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5384 if (VT.getVectorElementType() == MVT::f32) 5385 return Op; 5386 return DAG.UnrollVectorOp(Op.getNode()); 5387 } 5388 5389 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5390 Op.getOperand(0).getValueType() == MVT::v8i16) && 5391 "Invalid type for custom lowering!"); 5392 5393 const bool HasFullFP16 = 5394 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5395 5396 EVT DestVecType; 5397 if (VT == MVT::v4f32) 5398 DestVecType = MVT::v4i32; 5399 else if (VT == MVT::v4f16 && HasFullFP16) 5400 DestVecType = MVT::v4i16; 5401 else if (VT == MVT::v8f16 && HasFullFP16) 5402 DestVecType = MVT::v8i16; 5403 else 5404 return DAG.UnrollVectorOp(Op.getNode()); 5405 5406 unsigned CastOpc; 5407 unsigned Opc; 5408 switch (Op.getOpcode()) { 5409 default: llvm_unreachable("Invalid opcode!"); 5410 case ISD::SINT_TO_FP: 5411 CastOpc = ISD::SIGN_EXTEND; 5412 Opc = ISD::SINT_TO_FP; 5413 
break; 5414 case ISD::UINT_TO_FP: 5415 CastOpc = ISD::ZERO_EXTEND; 5416 Opc = ISD::UINT_TO_FP; 5417 break; 5418 } 5419 5420 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5421 return DAG.getNode(Opc, dl, VT, Op); 5422 } 5423 5424 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5425 EVT VT = Op.getValueType(); 5426 if (VT.isVector()) 5427 return LowerVectorINT_TO_FP(Op, DAG); 5428 if (isUnsupportedFloatingType(VT)) { 5429 RTLIB::Libcall LC; 5430 if (Op.getOpcode() == ISD::SINT_TO_FP) 5431 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5432 Op.getValueType()); 5433 else 5434 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 5435 Op.getValueType()); 5436 MakeLibCallOptions CallOptions; 5437 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5438 CallOptions, SDLoc(Op)).first; 5439 } 5440 5441 return Op; 5442 } 5443 5444 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5445 // Implement fcopysign with a fabs and a conditional fneg. 5446 SDValue Tmp0 = Op.getOperand(0); 5447 SDValue Tmp1 = Op.getOperand(1); 5448 SDLoc dl(Op); 5449 EVT VT = Op.getValueType(); 5450 EVT SrcVT = Tmp1.getValueType(); 5451 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5452 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5453 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5454 5455 if (UseNEON) { 5456 // Use VBSL to copy the sign bit. 5457 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5458 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5459 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5460 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 5461 if (VT == MVT::f64) 5462 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5463 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5464 DAG.getConstant(32, dl, MVT::i32)); 5465 else /*if (VT == MVT::f32)*/ 5466 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5467 if (SrcVT == MVT::f32) { 5468 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5469 if (VT == MVT::f64) 5470 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5471 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5472 DAG.getConstant(32, dl, MVT::i32)); 5473 } else if (VT == MVT::f32) 5474 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5475 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5476 DAG.getConstant(32, dl, MVT::i32)); 5477 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5478 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5479 5480 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5481 dl, MVT::i32); 5482 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5483 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5484 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5485 5486 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5487 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5488 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5489 if (VT == MVT::f32) { 5490 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5491 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5492 DAG.getConstant(0, dl, MVT::i32)); 5493 } else { 5494 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5495 } 5496 5497 return Res; 5498 } 5499 5500 // Bitcast operand 1 to i32. 5501 if (SrcVT == MVT::f64) 5502 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5503 Tmp1).getValue(1); 5504 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5505 5506 // Or in the signbit with integer operations. 
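  // Illustrative mapping (sketch): for f32 this computes
  //   (bitcast<i32>(Tmp0) & 0x7fffffff) | (bitcast<i32>(Tmp1) & 0x80000000)
  // and reinterprets the result as f32; the f64 path below applies the same
  // masks to the high word only.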
5507 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5508 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5509 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5510 if (VT == MVT::f32) { 5511 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5512 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5513 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5514 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5515 } 5516 5517 // f64: Or the high part with signbit and then combine two parts. 5518 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5519 Tmp0); 5520 SDValue Lo = Tmp0.getValue(0); 5521 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5522 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5523 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5524 } 5525 5526 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5527 MachineFunction &MF = DAG.getMachineFunction(); 5528 MachineFrameInfo &MFI = MF.getFrameInfo(); 5529 MFI.setReturnAddressIsTaken(true); 5530 5531 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5532 return SDValue(); 5533 5534 EVT VT = Op.getValueType(); 5535 SDLoc dl(Op); 5536 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5537 if (Depth) { 5538 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5539 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5540 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5541 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5542 MachinePointerInfo()); 5543 } 5544 5545 // Return LR, which contains the return address. Mark it an implicit live-in. 5546 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5547 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5548 } 5549 5550 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5551 const ARMBaseRegisterInfo &ARI = 5552 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5553 MachineFunction &MF = DAG.getMachineFunction(); 5554 MachineFrameInfo &MFI = MF.getFrameInfo(); 5555 MFI.setFrameAddressIsTaken(true); 5556 5557 EVT VT = Op.getValueType(); 5558 SDLoc dl(Op); // FIXME probably not meaningful 5559 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5560 Register FrameReg = ARI.getFrameRegister(MF); 5561 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5562 while (Depth--) 5563 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5564 MachinePointerInfo()); 5565 return FrameAddr; 5566 } 5567 5568 // FIXME? Maybe this could be a TableGen attribute on some registers and 5569 // this table could be generated automatically from RegInfo. 5570 Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 5571 const MachineFunction &MF) const { 5572 Register Reg = StringSwitch<unsigned>(RegName) 5573 .Case("sp", ARM::SP) 5574 .Default(0); 5575 if (Reg) 5576 return Reg; 5577 report_fatal_error(Twine("Invalid register name \"" 5578 + StringRef(RegName) + "\".")); 5579 } 5580 5581 // Result is 64 bit value so split into two 32 bit values and return as a 5582 // pair of values. 5583 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5584 SelectionDAG &DAG) { 5585 SDLoc DL(N); 5586 5587 // This function is only supposed to be called for i64 type destination. 
5588 assert(N->getValueType(0) == MVT::i64
5589 && "ExpandREAD_REGISTER called for non-i64 type result.");
5590
5591 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5592 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5593 N->getOperand(0),
5594 N->getOperand(1));
5595
5596 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5597 Read.getValue(1)));
5598 Results.push_back(Read.getOperand(0));
5599 }
5600
5601 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5602 /// When \p DstVT, the destination type of \p BC, is on the vector
5603 /// register bank and the source of the bitcast, \p Op, operates on the same bank,
5604 /// it might be possible to combine them, such that everything stays on the
5605 /// vector register bank.
5606 /// \return The node that would replace \p BC, if the combine
5607 /// is possible.
5608 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5609 SelectionDAG &DAG) {
5610 SDValue Op = BC->getOperand(0);
5611 EVT DstVT = BC->getValueType(0);
5612
5613 // The only vector instruction that can produce a scalar (remember,
5614 // since the bitcast was about to be turned into VMOVDRR, the source
5615 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5616 // Moreover, we can do this combine only if there is one use.
5617 // Finally, if the destination type is not a vector, there is not
5618 // much point in forcing everything on the vector bank.
5619 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5620 !Op.hasOneUse())
5621 return SDValue();
5622
5623 // If the index is not constant, we will introduce an additional
5624 // multiply that will stick.
5625 // Give up in that case.
5626 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5627 if (!Index)
5628 return SDValue();
5629 unsigned DstNumElt = DstVT.getVectorNumElements();
5630
5631 // Compute the new index.
5632 const APInt &APIntIndex = Index->getAPIntValue();
5633 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5634 NewIndex *= APIntIndex;
5635 // Check if the new constant index fits into i32.
5636 if (NewIndex.getBitWidth() > 32)
5637 return SDValue();
5638
5639 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5640 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5641 SDLoc dl(Op);
5642 SDValue ExtractSrc = Op.getOperand(0);
5643 EVT VecVT = EVT::getVectorVT(
5644 *DAG.getContext(), DstVT.getScalarType(),
5645 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5646 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5647 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5648 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5649 }
5650
5651 /// ExpandBITCAST - If the target supports VFP, this function is called to
5652 /// expand a bit convert where either the source or destination type is i64 to
5653 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
5654 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
5655 /// vectors), since the legalizer won't know what to do with that.
5656 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5657 const ARMSubtarget *Subtarget) {
5658 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5659 SDLoc dl(N);
5660 SDValue Op = N->getOperand(0);
5661
5662 // This function is only supposed to be called for i64 types, either as the
5663 // source or destination of the bit convert.
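  // For example, (f64 (bitcast i64 X)) becomes a VMOVDRR of the two i32 halves
  // of X, and (i64 (bitcast f64 X)) becomes a VMOVRRD followed by a BUILD_PAIR
  // of the two returned i32 values.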
5664 EVT SrcVT = Op.getValueType(); 5665 EVT DstVT = N->getValueType(0); 5666 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5667 5668 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5669 // FullFP16: half values are passed in S-registers, and we don't 5670 // need any of the bitcast and moves: 5671 // 5672 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5673 // t5: i32 = bitcast t2 5674 // t18: f16 = ARMISD::VMOVhr t5 5675 if (Op.getOpcode() != ISD::CopyFromReg || 5676 Op.getValueType() != MVT::f32) 5677 return SDValue(); 5678 5679 auto Move = N->use_begin(); 5680 if (Move->getOpcode() != ARMISD::VMOVhr) 5681 return SDValue(); 5682 5683 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5684 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5685 DAG.ReplaceAllUsesWith(*Move, &Copy); 5686 return Copy; 5687 } 5688 5689 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5690 if (!HasFullFP16) 5691 return SDValue(); 5692 // SoftFP: read half-precision arguments: 5693 // 5694 // t2: i32,ch = ... 5695 // t7: i16 = truncate t2 <~~~~ Op 5696 // t8: f16 = bitcast t7 <~~~~ N 5697 // 5698 if (Op.getOperand(0).getValueType() == MVT::i32) 5699 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5700 MVT::f16, Op.getOperand(0)); 5701 5702 return SDValue(); 5703 } 5704 5705 // Half-precision return values 5706 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5707 if (!HasFullFP16) 5708 return SDValue(); 5709 // 5710 // t11: f16 = fadd t8, t10 5711 // t12: i16 = bitcast t11 <~~~ SDNode N 5712 // t13: i32 = zero_extend t12 5713 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5714 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5715 // 5716 // transform this into: 5717 // 5718 // t20: i32 = ARMISD::VMOVrh t11 5719 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5720 // 5721 auto ZeroExtend = N->use_begin(); 5722 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5723 ZeroExtend->getValueType(0) != MVT::i32) 5724 return SDValue(); 5725 5726 auto Copy = ZeroExtend->use_begin(); 5727 if (Copy->getOpcode() == ISD::CopyToReg && 5728 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5729 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5730 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5731 return Cvt; 5732 } 5733 return SDValue(); 5734 } 5735 5736 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5737 return SDValue(); 5738 5739 // Turn i64->f64 into VMOVDRR. 5740 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5741 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5742 // if we can combine the bitcast with its source. 5743 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5744 return Val; 5745 5746 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5747 DAG.getConstant(0, dl, MVT::i32)); 5748 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5749 DAG.getConstant(1, dl, MVT::i32)); 5750 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5751 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5752 } 5753 5754 // Turn f64->i64 into VMOVRRD. 
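// ARMISD::VMOVRRD copies the two 32-bit halves of a D register into a pair of
// i32 values; on big-endian targets a multi-element vector source is first run
// through VREV64 below so the two halves come out in the expected order.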
5755 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
5756 SDValue Cvt;
5757 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5758 SrcVT.getVectorNumElements() > 1)
5759 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5760 DAG.getVTList(MVT::i32, MVT::i32),
5761 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
5762 else
5763 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5764 DAG.getVTList(MVT::i32, MVT::i32), Op);
5765 // Merge the pieces into a single i64 value.
5766 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
5767 }
5768
5769 return SDValue();
5770 }
5771
5772 /// getZeroVector - Returns a vector of specified type with all zero elements.
5773 /// Zero vectors are used to represent vector negation and in those cases
5774 /// will be implemented with the NEON VNEG instruction. However, VNEG does
5775 /// not support i64 elements, so sometimes the zero vectors will need to be
5776 /// explicitly constructed. Regardless, use a canonical VMOV to create the
5777 /// zero vector.
5778 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5779 assert(VT.isVector() && "Expected a vector type");
5780 // The canonical modified immediate encoding of a zero vector is....0!
5781 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
5782 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
5783 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
5784 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5785 }
5786
5787 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
5788 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
5789 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
5790 SelectionDAG &DAG) const {
5791 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5792 EVT VT = Op.getValueType();
5793 unsigned VTBits = VT.getSizeInBits();
5794 SDLoc dl(Op);
5795 SDValue ShOpLo = Op.getOperand(0);
5796 SDValue ShOpHi = Op.getOperand(1);
5797 SDValue ShAmt = Op.getOperand(2);
5798 SDValue ARMcc;
5799 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5800 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
5801
5802 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
5803
5804 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5805 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5806 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
5807 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5808 DAG.getConstant(VTBits, dl, MVT::i32));
5809 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
5810 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5811 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
5812 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5813 ISD::SETGE, ARMcc, DAG, dl);
5814 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
5815 ARMcc, CCR, CmpLo);
5816
5817 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
5818 SDValue HiBigShift = Opc == ISD::SRA
5819 ? DAG.getNode(Opc, dl, VT, ShOpHi,
5820 DAG.getConstant(VTBits - 1, dl, VT))
5821 : DAG.getConstant(0, dl, VT);
5822 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5823 ISD::SETGE, ARMcc, DAG, dl);
5824 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5825 ARMcc, CCR, CmpHi);
5826
5827 SDValue Ops[2] = { Lo, Hi };
5828 return DAG.getMergeValues(Ops, dl);
5829 }
5830
5831 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
5832 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
5833 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
5834 SelectionDAG &DAG) const {
5835 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5836 EVT VT = Op.getValueType();
5837 unsigned VTBits = VT.getSizeInBits();
5838 SDLoc dl(Op);
5839 SDValue ShOpLo = Op.getOperand(0);
5840 SDValue ShOpHi = Op.getOperand(1);
5841 SDValue ShAmt = Op.getOperand(2);
5842 SDValue ARMcc;
5843 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5844
5845 assert(Op.getOpcode() == ISD::SHL_PARTS);
5846 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5847 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5848 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
5849 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
5850 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5851
5852 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5853 DAG.getConstant(VTBits, dl, MVT::i32));
5854 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
5855 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5856 ISD::SETGE, ARMcc, DAG, dl);
5857 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5858 ARMcc, CCR, CmpHi);
5859
5860 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5861 ISD::SETGE, ARMcc, DAG, dl);
5862 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5863 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
5864 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
5865
5866 SDValue Ops[2] = { Lo, Hi };
5867 return DAG.getMergeValues(Ops, dl);
5868 }
5869
5870 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
5871 SelectionDAG &DAG) const {
5872 // The rounding mode is in bits 23:22 of the FPSCR.
5873 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5874 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
5875 // so that the shift and the AND get folded into a bitfield extract.
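  // For example, an RMode of 0b01 (round towards plus infinity) becomes
  // ((1 + 1) & 3) = 2, which is the corresponding FLT_ROUNDS value.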
5876 SDLoc dl(Op); 5877 SDValue Ops[] = { DAG.getEntryNode(), 5878 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5879 5880 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5881 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5882 DAG.getConstant(1U << 22, dl, MVT::i32)); 5883 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5884 DAG.getConstant(22, dl, MVT::i32)); 5885 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5886 DAG.getConstant(3, dl, MVT::i32)); 5887 } 5888 5889 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5890 const ARMSubtarget *ST) { 5891 SDLoc dl(N); 5892 EVT VT = N->getValueType(0); 5893 if (VT.isVector() && ST->hasNEON()) { 5894 5895 // Compute the least significant set bit: LSB = X & -X 5896 SDValue X = N->getOperand(0); 5897 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5898 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5899 5900 EVT ElemTy = VT.getVectorElementType(); 5901 5902 if (ElemTy == MVT::i8) { 5903 // Compute with: cttz(x) = ctpop(lsb - 1) 5904 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5905 DAG.getTargetConstant(1, dl, ElemTy)); 5906 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5907 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5908 } 5909 5910 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5911 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5912 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5913 unsigned NumBits = ElemTy.getSizeInBits(); 5914 SDValue WidthMinus1 = 5915 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5916 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5917 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5918 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5919 } 5920 5921 // Compute with: cttz(x) = ctpop(lsb - 1) 5922 5923 // Compute LSB - 1. 5924 SDValue Bits; 5925 if (ElemTy == MVT::i64) { 5926 // Load constant 0xffff'ffff'ffff'ffff to register. 5927 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5928 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5929 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5930 } else { 5931 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5932 DAG.getTargetConstant(1, dl, ElemTy)); 5933 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5934 } 5935 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5936 } 5937 5938 if (!ST->hasV6T2Ops()) 5939 return SDValue(); 5940 5941 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5942 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 5943 } 5944 5945 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 5946 const ARMSubtarget *ST) { 5947 EVT VT = N->getValueType(0); 5948 SDLoc DL(N); 5949 5950 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 5951 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 5952 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 5953 "Unexpected type for custom ctpop lowering"); 5954 5955 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5956 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 5957 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 5958 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 5959 5960 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 5961 unsigned EltSize = 8; 5962 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 5963 while (EltSize != VT.getScalarSizeInBits()) { 5964 SmallVector<SDValue, 8> Ops; 5965 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 5966 TLI.getPointerTy(DAG.getDataLayout()))); 5967 Ops.push_back(Res); 5968 5969 EltSize *= 2; 5970 NumElts /= 2; 5971 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 5972 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 5973 } 5974 5975 return Res; 5976 } 5977 5978 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 5979 /// operand of a vector shift operation, where all the elements of the 5980 /// build_vector must have the same constant integer value. 5981 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 5982 // Ignore bit_converts. 5983 while (Op.getOpcode() == ISD::BITCAST) 5984 Op = Op.getOperand(0); 5985 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 5986 APInt SplatBits, SplatUndef; 5987 unsigned SplatBitSize; 5988 bool HasAnyUndefs; 5989 if (!BVN || 5990 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 5991 ElementBits) || 5992 SplatBitSize > ElementBits) 5993 return false; 5994 Cnt = SplatBits.getSExtValue(); 5995 return true; 5996 } 5997 5998 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 5999 /// operand of a vector shift left operation. That value must be in the range: 6000 /// 0 <= Value < ElementBits for a left shift; or 6001 /// 0 <= Value <= ElementBits for a long left shift. 6002 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6003 assert(VT.isVector() && "vector shift count is not a vector type"); 6004 int64_t ElementBits = VT.getScalarSizeInBits(); 6005 if (!getVShiftImm(Op, ElementBits, Cnt)) 6006 return false; 6007 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6008 } 6009 6010 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6011 /// operand of a vector shift right operation. For a shift opcode, the value 6012 /// is positive, but for an intrinsic the value count must be negative. The 6013 /// absolute value must be in the range: 6014 /// 1 <= |Value| <= ElementBits for a right shift; or 6015 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6016 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6017 int64_t &Cnt) { 6018 assert(VT.isVector() && "vector shift count is not a vector type"); 6019 int64_t ElementBits = VT.getScalarSizeInBits(); 6020 if (!getVShiftImm(Op, ElementBits, Cnt)) 6021 return false; 6022 if (!isIntrinsic) 6023 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6024 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6025 Cnt = -Cnt; 6026 return true; 6027 } 6028 return false; 6029 } 6030 6031 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6032 const ARMSubtarget *ST) { 6033 EVT VT = N->getValueType(0); 6034 SDLoc dl(N); 6035 int64_t Cnt; 6036 6037 if (!VT.isVector()) 6038 return SDValue(); 6039 6040 // We essentially have two forms here. Shift by an immediate and shift by a 6041 // vector register (there are also shift by a gpr, but that is just handled 6042 // with a tablegen pattern). We cannot easily match shift by an immediate in 6043 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6044 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6045 // signed or unsigned, and a negative shift indicates a shift right). 
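// For example, an SRL of a v4i32 by a non-constant vector amount is emitted
// below as VSHLu(x, 0 - amt): there is no register-form VSHR, so the right
// shift is expressed as a left shift by the negated per-lane amounts.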
6046 if (N->getOpcode() == ISD::SHL) { 6047 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6048 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6049 DAG.getConstant(Cnt, dl, MVT::i32)); 6050 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6051 N->getOperand(1)); 6052 } 6053 6054 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6055 "unexpected vector shift opcode"); 6056 6057 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6058 unsigned VShiftOpc = 6059 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6060 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6061 DAG.getConstant(Cnt, dl, MVT::i32)); 6062 } 6063 6064 // Other right shifts we don't have operations for (we use a shift left by a 6065 // negative number). 6066 EVT ShiftVT = N->getOperand(1).getValueType(); 6067 SDValue NegatedCount = DAG.getNode( 6068 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6069 unsigned VShiftOpc = 6070 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); 6071 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6072 } 6073 6074 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6075 const ARMSubtarget *ST) { 6076 EVT VT = N->getValueType(0); 6077 SDLoc dl(N); 6078 6079 // We can get here for a node like i32 = ISD::SHL i32, i64 6080 if (VT != MVT::i64) 6081 return SDValue(); 6082 6083 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6084 N->getOpcode() == ISD::SHL) && 6085 "Unknown shift to lower!"); 6086 6087 unsigned ShOpc = N->getOpcode(); 6088 if (ST->hasMVEIntegerOps()) { 6089 SDValue ShAmt = N->getOperand(1); 6090 unsigned ShPartsOpc = ARMISD::LSLL; 6091 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6092 6093 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6094 // then do the default optimisation 6095 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6096 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6097 return SDValue(); 6098 6099 // Extract the lower 32 bits of the shift amount if it's not an i32 6100 if (ShAmt->getValueType(0) != MVT::i32) 6101 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6102 6103 if (ShOpc == ISD::SRL) { 6104 if (!Con) 6105 // There is no t2LSRLr instruction so negate and perform an lsll if the 6106 // shift amount is in a register, emulating a right shift. 6107 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6108 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6109 else 6110 // Else generate an lsrl on the immediate shift amount 6111 ShPartsOpc = ARMISD::LSRL; 6112 } else if (ShOpc == ISD::SRA) 6113 ShPartsOpc = ARMISD::ASRL; 6114 6115 // Lower 32 bits of the destination/source 6116 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6117 DAG.getConstant(0, dl, MVT::i32)); 6118 // Upper 32 bits of the destination/source 6119 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6120 DAG.getConstant(1, dl, MVT::i32)); 6121 6122 // Generate the shift operation as computed above 6123 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6124 ShAmt); 6125 // The upper 32 bits come from the second return value of lsll 6126 Hi = SDValue(Lo.getNode(), 1); 6127 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6128 } 6129 6130 // We only lower SRA, SRL of 1 here, all others use generic lowering. 
6131 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6132 return SDValue(); 6133 6134 // If we are in thumb mode, we don't have RRX. 6135 if (ST->isThumb1Only()) 6136 return SDValue(); 6137 6138 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6139 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6140 DAG.getConstant(0, dl, MVT::i32)); 6141 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6142 DAG.getConstant(1, dl, MVT::i32)); 6143 6144 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6145 // captures the result into a carry flag. 6146 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6147 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6148 6149 // The low part is an ARMISD::RRX operand, which shifts the carry in. 6150 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6151 6152 // Merge the pieces into a single i64 value. 6153 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6154 } 6155 6156 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6157 const ARMSubtarget *ST) { 6158 bool Invert = false; 6159 bool Swap = false; 6160 unsigned Opc = ARMCC::AL; 6161 6162 SDValue Op0 = Op.getOperand(0); 6163 SDValue Op1 = Op.getOperand(1); 6164 SDValue CC = Op.getOperand(2); 6165 EVT VT = Op.getValueType(); 6166 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6167 SDLoc dl(Op); 6168 6169 EVT CmpVT; 6170 if (ST->hasNEON()) 6171 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6172 else { 6173 assert(ST->hasMVEIntegerOps() && 6174 "No hardware support for integer vector comparison!"); 6175 6176 if (Op.getValueType().getVectorElementType() != MVT::i1) 6177 return SDValue(); 6178 6179 // Make sure we expand floating point setcc to scalar if we do not have 6180 // mve.fp, so that we can handle them from there. 6181 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6182 return SDValue(); 6183 6184 CmpVT = VT; 6185 } 6186 6187 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6188 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6189 // Special-case integer 64-bit equality comparisons. They aren't legal, 6190 // but they can be lowered with a few vector instructions. 6191 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6192 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6193 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6194 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6195 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6196 DAG.getCondCode(ISD::SETEQ)); 6197 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6198 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6199 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6200 if (SetCCOpcode == ISD::SETNE) 6201 Merged = DAG.getNOT(dl, Merged, CmpVT); 6202 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6203 return Merged; 6204 } 6205 6206 if (CmpVT.getVectorElementType() == MVT::i64) 6207 // 64-bit comparisons are not legal in general. 
6208 return SDValue(); 6209 6210 if (Op1.getValueType().isFloatingPoint()) { 6211 switch (SetCCOpcode) { 6212 default: llvm_unreachable("Illegal FP comparison"); 6213 case ISD::SETUNE: 6214 case ISD::SETNE: 6215 if (ST->hasMVEFloatOps()) { 6216 Opc = ARMCC::NE; break; 6217 } else { 6218 Invert = true; LLVM_FALLTHROUGH; 6219 } 6220 case ISD::SETOEQ: 6221 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6222 case ISD::SETOLT: 6223 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6224 case ISD::SETOGT: 6225 case ISD::SETGT: Opc = ARMCC::GT; break; 6226 case ISD::SETOLE: 6227 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6228 case ISD::SETOGE: 6229 case ISD::SETGE: Opc = ARMCC::GE; break; 6230 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6231 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6232 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6233 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6234 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6235 case ISD::SETONE: { 6236 // Expand this to (OLT | OGT). 6237 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6238 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6239 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6240 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6241 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6242 if (Invert) 6243 Result = DAG.getNOT(dl, Result, VT); 6244 return Result; 6245 } 6246 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6247 case ISD::SETO: { 6248 // Expand this to (OLT | OGE). 6249 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6250 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6251 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6252 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6253 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6254 if (Invert) 6255 Result = DAG.getNOT(dl, Result, VT); 6256 return Result; 6257 } 6258 } 6259 } else { 6260 // Integer comparisons. 6261 switch (SetCCOpcode) { 6262 default: llvm_unreachable("Illegal integer comparison"); 6263 case ISD::SETNE: 6264 if (ST->hasMVEIntegerOps()) { 6265 Opc = ARMCC::NE; break; 6266 } else { 6267 Invert = true; LLVM_FALLTHROUGH; 6268 } 6269 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6270 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6271 case ISD::SETGT: Opc = ARMCC::GT; break; 6272 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6273 case ISD::SETGE: Opc = ARMCC::GE; break; 6274 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6275 case ISD::SETUGT: Opc = ARMCC::HI; break; 6276 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6277 case ISD::SETUGE: Opc = ARMCC::HS; break; 6278 } 6279 6280 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6281 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6282 SDValue AndOp; 6283 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6284 AndOp = Op0; 6285 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6286 AndOp = Op1; 6287 6288 // Ignore bitconvert. 
6289 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6290 AndOp = AndOp.getOperand(0); 6291 6292 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6293 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6294 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6295 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6296 if (!Invert) 6297 Result = DAG.getNOT(dl, Result, VT); 6298 return Result; 6299 } 6300 } 6301 } 6302 6303 if (Swap) 6304 std::swap(Op0, Op1); 6305 6306 // If one of the operands is a constant vector zero, attempt to fold the 6307 // comparison to a specialized compare-against-zero form. 6308 SDValue SingleOp; 6309 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6310 SingleOp = Op0; 6311 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6312 if (Opc == ARMCC::GE) 6313 Opc = ARMCC::LE; 6314 else if (Opc == ARMCC::GT) 6315 Opc = ARMCC::LT; 6316 SingleOp = Op1; 6317 } 6318 6319 SDValue Result; 6320 if (SingleOp.getNode()) { 6321 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6322 DAG.getConstant(Opc, dl, MVT::i32)); 6323 } else { 6324 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6325 DAG.getConstant(Opc, dl, MVT::i32)); 6326 } 6327 6328 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6329 6330 if (Invert) 6331 Result = DAG.getNOT(dl, Result, VT); 6332 6333 return Result; 6334 } 6335 6336 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6337 SDValue LHS = Op.getOperand(0); 6338 SDValue RHS = Op.getOperand(1); 6339 SDValue Carry = Op.getOperand(2); 6340 SDValue Cond = Op.getOperand(3); 6341 SDLoc DL(Op); 6342 6343 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6344 6345 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6346 // have to invert the carry first. 6347 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6348 DAG.getConstant(1, DL, MVT::i32), Carry); 6349 // This converts the boolean value carry into the carry flag. 6350 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6351 6352 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6353 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6354 6355 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6356 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6357 SDValue ARMcc = DAG.getConstant( 6358 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6359 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6360 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6361 Cmp.getValue(1), SDValue()); 6362 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6363 CCR, Chain.getValue(1)); 6364 } 6365 6366 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6367 /// valid vector constant for a NEON or MVE instruction with a "modified 6368 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6369 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6370 unsigned SplatBitSize, SelectionDAG &DAG, 6371 const SDLoc &dl, EVT &VT, bool is128Bits, 6372 VMOVModImmType type) { 6373 unsigned OpCmode, Imm; 6374 6375 // SplatBitSize is set to the smallest size that splats the vector, so a 6376 // zero vector will always have SplatBitSize == 8. However, NEON modified 6377 // immediate instructions others than VMOV do not support the 8-bit encoding 6378 // of a zero vector, and the default encoding of zero is supposed to be the 6379 // 32-bit version. 
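  // For example, a 32-bit splat of 0x00ff0000 is encoded below as Op=0,
  // Cmode=010x, Imm=0xff, whereas 0x00ff00ff has more than one non-zero byte
  // and matches none of the modified-immediate forms.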
6380 if (SplatBits == 0) 6381 SplatBitSize = 32; 6382 6383 switch (SplatBitSize) { 6384 case 8: 6385 if (type != VMOVModImm) 6386 return SDValue(); 6387 // Any 1-byte value is OK. Op=0, Cmode=1110. 6388 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6389 OpCmode = 0xe; 6390 Imm = SplatBits; 6391 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6392 break; 6393 6394 case 16: 6395 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6396 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6397 if ((SplatBits & ~0xff) == 0) { 6398 // Value = 0x00nn: Op=x, Cmode=100x. 6399 OpCmode = 0x8; 6400 Imm = SplatBits; 6401 break; 6402 } 6403 if ((SplatBits & ~0xff00) == 0) { 6404 // Value = 0xnn00: Op=x, Cmode=101x. 6405 OpCmode = 0xa; 6406 Imm = SplatBits >> 8; 6407 break; 6408 } 6409 return SDValue(); 6410 6411 case 32: 6412 // NEON's 32-bit VMOV supports splat values where: 6413 // * only one byte is nonzero, or 6414 // * the least significant byte is 0xff and the second byte is nonzero, or 6415 // * the least significant 2 bytes are 0xff and the third is nonzero. 6416 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6417 if ((SplatBits & ~0xff) == 0) { 6418 // Value = 0x000000nn: Op=x, Cmode=000x. 6419 OpCmode = 0; 6420 Imm = SplatBits; 6421 break; 6422 } 6423 if ((SplatBits & ~0xff00) == 0) { 6424 // Value = 0x0000nn00: Op=x, Cmode=001x. 6425 OpCmode = 0x2; 6426 Imm = SplatBits >> 8; 6427 break; 6428 } 6429 if ((SplatBits & ~0xff0000) == 0) { 6430 // Value = 0x00nn0000: Op=x, Cmode=010x. 6431 OpCmode = 0x4; 6432 Imm = SplatBits >> 16; 6433 break; 6434 } 6435 if ((SplatBits & ~0xff000000) == 0) { 6436 // Value = 0xnn000000: Op=x, Cmode=011x. 6437 OpCmode = 0x6; 6438 Imm = SplatBits >> 24; 6439 break; 6440 } 6441 6442 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6443 if (type == OtherModImm) return SDValue(); 6444 6445 if ((SplatBits & ~0xffff) == 0 && 6446 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6447 // Value = 0x0000nnff: Op=x, Cmode=1100. 6448 OpCmode = 0xc; 6449 Imm = SplatBits >> 8; 6450 break; 6451 } 6452 6453 // cmode == 0b1101 is not supported for MVE VMVN 6454 if (type == MVEVMVNModImm) 6455 return SDValue(); 6456 6457 if ((SplatBits & ~0xffffff) == 0 && 6458 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6459 // Value = 0x00nnffff: Op=x, Cmode=1101. 6460 OpCmode = 0xd; 6461 Imm = SplatBits >> 16; 6462 break; 6463 } 6464 6465 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6466 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6467 // VMOV.I32. A (very) minor optimization would be to replicate the value 6468 // and fall through here to test for a valid 64-bit splat. But, then the 6469 // caller would also need to check and handle the change in size. 6470 return SDValue(); 6471 6472 case 64: { 6473 if (type != VMOVModImm) 6474 return SDValue(); 6475 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6476 uint64_t BitMask = 0xff; 6477 uint64_t Val = 0; 6478 unsigned ImmMask = 1; 6479 Imm = 0; 6480 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6481 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6482 Val |= BitMask; 6483 Imm |= ImmMask; 6484 } else if ((SplatBits & BitMask) != 0) { 6485 return SDValue(); 6486 } 6487 BitMask <<= 8; 6488 ImmMask <<= 1; 6489 } 6490 6491 if (DAG.getDataLayout().isBigEndian()) 6492 // swap higher and lower 32 bit word 6493 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 6494 6495 // Op=1, Cmode=1110. 
6496 OpCmode = 0x1e; 6497 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 6498 break; 6499 } 6500 6501 default: 6502 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 6503 } 6504 6505 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 6506 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 6507 } 6508 6509 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 6510 const ARMSubtarget *ST) const { 6511 EVT VT = Op.getValueType(); 6512 bool IsDouble = (VT == MVT::f64); 6513 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 6514 const APFloat &FPVal = CFP->getValueAPF(); 6515 6516 // Prevent floating-point constants from using literal loads 6517 // when execute-only is enabled. 6518 if (ST->genExecuteOnly()) { 6519 // If we can represent the constant as an immediate, don't lower it 6520 if (isFPImmLegal(FPVal, VT)) 6521 return Op; 6522 // Otherwise, construct as integer, and move to float register 6523 APInt INTVal = FPVal.bitcastToAPInt(); 6524 SDLoc DL(CFP); 6525 switch (VT.getSimpleVT().SimpleTy) { 6526 default: 6527 llvm_unreachable("Unknown floating point type!"); 6528 break; 6529 case MVT::f64: { 6530 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 6531 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 6532 if (!ST->isLittle()) 6533 std::swap(Lo, Hi); 6534 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 6535 } 6536 case MVT::f32: 6537 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 6538 DAG.getConstant(INTVal, DL, MVT::i32)); 6539 } 6540 } 6541 6542 if (!ST->hasVFP3Base()) 6543 return SDValue(); 6544 6545 // Use the default (constant pool) lowering for double constants when we have 6546 // an SP-only FPU 6547 if (IsDouble && !Subtarget->hasFP64()) 6548 return SDValue(); 6549 6550 // Try splatting with a VMOV.f32... 6551 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 6552 6553 if (ImmVal != -1) { 6554 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 6555 // We have code in place to select a valid ConstantFP already, no need to 6556 // do any mangling. 6557 return Op; 6558 } 6559 6560 // It's a float and we are trying to use NEON operations where 6561 // possible. Lower it to a splat followed by an extract. 6562 SDLoc DL(Op); 6563 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6564 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6565 NewVal); 6566 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6567 DAG.getConstant(0, DL, MVT::i32)); 6568 } 6569 6570 // The rest of our options are NEON only, make sure that's allowed before 6571 // proceeding.. 6572 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 6573 return SDValue(); 6574 6575 EVT VMovVT; 6576 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 6577 6578 // It wouldn't really be worth bothering for doubles except for one very 6579 // important value, which does happen to match: 0.0. So make sure we don't do 6580 // anything stupid. 6581 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 6582 return SDValue(); 6583 6584 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 
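  // Only bit patterns that fit a modified-immediate form qualify; e.g. -0.0f
  // (0x80000000, a single non-zero byte) can be built this way, while most
  // other float patterns fall through to the VMVN.i32 attempt below or, after
  // that, to the default constant-pool lowering.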
6585 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 6586 VMovVT, false, VMOVModImm); 6587 if (NewVal != SDValue()) { 6588 SDLoc DL(Op); 6589 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 6590 NewVal); 6591 if (IsDouble) 6592 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6593 6594 // It's a float: cast and extract a vector element. 6595 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6596 VecConstant); 6597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6598 DAG.getConstant(0, DL, MVT::i32)); 6599 } 6600 6601 // Finally, try a VMVN.i32 6602 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 6603 false, VMVNModImm); 6604 if (NewVal != SDValue()) { 6605 SDLoc DL(Op); 6606 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 6607 6608 if (IsDouble) 6609 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6610 6611 // It's a float: cast and extract a vector element. 6612 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6613 VecConstant); 6614 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6615 DAG.getConstant(0, DL, MVT::i32)); 6616 } 6617 6618 return SDValue(); 6619 } 6620 6621 // check if an VEXT instruction can handle the shuffle mask when the 6622 // vector sources of the shuffle are the same. 6623 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6624 unsigned NumElts = VT.getVectorNumElements(); 6625 6626 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6627 if (M[0] < 0) 6628 return false; 6629 6630 Imm = M[0]; 6631 6632 // If this is a VEXT shuffle, the immediate value is the index of the first 6633 // element. The other shuffle indices must be the successive elements after 6634 // the first one. 6635 unsigned ExpectedElt = Imm; 6636 for (unsigned i = 1; i < NumElts; ++i) { 6637 // Increment the expected index. If it wraps around, just follow it 6638 // back to index zero and keep going. 6639 ++ExpectedElt; 6640 if (ExpectedElt == NumElts) 6641 ExpectedElt = 0; 6642 6643 if (M[i] < 0) continue; // ignore UNDEF indices 6644 if (ExpectedElt != static_cast<unsigned>(M[i])) 6645 return false; 6646 } 6647 6648 return true; 6649 } 6650 6651 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6652 bool &ReverseVEXT, unsigned &Imm) { 6653 unsigned NumElts = VT.getVectorNumElements(); 6654 ReverseVEXT = false; 6655 6656 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6657 if (M[0] < 0) 6658 return false; 6659 6660 Imm = M[0]; 6661 6662 // If this is a VEXT shuffle, the immediate value is the index of the first 6663 // element. The other shuffle indices must be the successive elements after 6664 // the first one. 6665 unsigned ExpectedElt = Imm; 6666 for (unsigned i = 1; i < NumElts; ++i) { 6667 // Increment the expected index. If it wraps around, it may still be 6668 // a VEXT but the source vectors must be swapped. 6669 ExpectedElt += 1; 6670 if (ExpectedElt == NumElts * 2) { 6671 ExpectedElt = 0; 6672 ReverseVEXT = true; 6673 } 6674 6675 if (M[i] < 0) continue; // ignore UNDEF indices 6676 if (ExpectedElt != static_cast<unsigned>(M[i])) 6677 return false; 6678 } 6679 6680 // Adjust the index value if the source operands will be swapped. 6681 if (ReverseVEXT) 6682 Imm -= NumElts; 6683 6684 return true; 6685 } 6686 6687 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 6688 /// instruction with the specified blocksize. 
(The order of the elements 6689 /// within each block of the vector is reversed.) 6690 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6691 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6692 "Only possible block sizes for VREV are: 16, 32, 64"); 6693 6694 unsigned EltSz = VT.getScalarSizeInBits(); 6695 if (EltSz == 64) 6696 return false; 6697 6698 unsigned NumElts = VT.getVectorNumElements(); 6699 unsigned BlockElts = M[0] + 1; 6700 // If the first shuffle index is UNDEF, be optimistic. 6701 if (M[0] < 0) 6702 BlockElts = BlockSize / EltSz; 6703 6704 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6705 return false; 6706 6707 for (unsigned i = 0; i < NumElts; ++i) { 6708 if (M[i] < 0) continue; // ignore UNDEF indices 6709 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6710 return false; 6711 } 6712 6713 return true; 6714 } 6715 6716 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6717 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6718 // range, then 0 is placed into the resulting vector. So pretty much any mask 6719 // of 8 elements can work here. 6720 return VT == MVT::v8i8 && M.size() == 8; 6721 } 6722 6723 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6724 unsigned Index) { 6725 if (Mask.size() == Elements * 2) 6726 return Index / Elements; 6727 return Mask[Index] == 0 ? 0 : 1; 6728 } 6729 6730 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6731 // checking that pairs of elements in the shuffle mask represent the same index 6732 // in each vector, incrementing the expected index by 2 at each step. 6733 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6734 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6735 // v2={e,f,g,h} 6736 // WhichResult gives the offset for each element in the mask based on which 6737 // of the two results it belongs to. 6738 // 6739 // The transpose can be represented either as: 6740 // result1 = shufflevector v1, v2, result1_shuffle_mask 6741 // result2 = shufflevector v1, v2, result2_shuffle_mask 6742 // where v1/v2 and the shuffle masks have the same number of elements 6743 // (here WhichResult (see below) indicates which result is being checked) 6744 // 6745 // or as: 6746 // results = shufflevector v1, v2, shuffle_mask 6747 // where both results are returned in one vector and the shuffle mask has twice 6748 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6749 // want to check the low half and high half of the shuffle mask as if it were 6750 // the other case 6751 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6752 unsigned EltSz = VT.getScalarSizeInBits(); 6753 if (EltSz == 64) 6754 return false; 6755 6756 unsigned NumElts = VT.getVectorNumElements(); 6757 if (M.size() != NumElts && M.size() != NumElts*2) 6758 return false; 6759 6760 // If the mask is twice as long as the input vector then we need to check the 6761 // upper and lower parts of the mask with a matching value for WhichResult 6762 // FIXME: A mask with only even values will be rejected in case the first 6763 // element is undefined, e.g. 
[-1, 4, 2, 6] will be rejected, because only 6764 // M[0] is used to determine WhichResult 6765 for (unsigned i = 0; i < M.size(); i += NumElts) { 6766 WhichResult = SelectPairHalf(NumElts, M, i); 6767 for (unsigned j = 0; j < NumElts; j += 2) { 6768 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6769 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 6770 return false; 6771 } 6772 } 6773 6774 if (M.size() == NumElts*2) 6775 WhichResult = 0; 6776 6777 return true; 6778 } 6779 6780 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 6781 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6782 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 6783 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6784 unsigned EltSz = VT.getScalarSizeInBits(); 6785 if (EltSz == 64) 6786 return false; 6787 6788 unsigned NumElts = VT.getVectorNumElements(); 6789 if (M.size() != NumElts && M.size() != NumElts*2) 6790 return false; 6791 6792 for (unsigned i = 0; i < M.size(); i += NumElts) { 6793 WhichResult = SelectPairHalf(NumElts, M, i); 6794 for (unsigned j = 0; j < NumElts; j += 2) { 6795 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6796 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 6797 return false; 6798 } 6799 } 6800 6801 if (M.size() == NumElts*2) 6802 WhichResult = 0; 6803 6804 return true; 6805 } 6806 6807 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 6808 // that the mask elements are either all even and in steps of size 2 or all odd 6809 // and in steps of size 2. 6810 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 6811 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 6812 // v2={e,f,g,h} 6813 // Requires similar checks to that of isVTRNMask with 6814 // respect the how results are returned. 6815 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6816 unsigned EltSz = VT.getScalarSizeInBits(); 6817 if (EltSz == 64) 6818 return false; 6819 6820 unsigned NumElts = VT.getVectorNumElements(); 6821 if (M.size() != NumElts && M.size() != NumElts*2) 6822 return false; 6823 6824 for (unsigned i = 0; i < M.size(); i += NumElts) { 6825 WhichResult = SelectPairHalf(NumElts, M, i); 6826 for (unsigned j = 0; j < NumElts; ++j) { 6827 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6828 return false; 6829 } 6830 } 6831 6832 if (M.size() == NumElts*2) 6833 WhichResult = 0; 6834 6835 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6836 if (VT.is64BitVector() && EltSz == 32) 6837 return false; 6838 6839 return true; 6840 } 6841 6842 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6843 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 
6844 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6845 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6846 unsigned EltSz = VT.getScalarSizeInBits(); 6847 if (EltSz == 64) 6848 return false; 6849 6850 unsigned NumElts = VT.getVectorNumElements(); 6851 if (M.size() != NumElts && M.size() != NumElts*2) 6852 return false; 6853 6854 unsigned Half = NumElts / 2; 6855 for (unsigned i = 0; i < M.size(); i += NumElts) { 6856 WhichResult = SelectPairHalf(NumElts, M, i); 6857 for (unsigned j = 0; j < NumElts; j += Half) { 6858 unsigned Idx = WhichResult; 6859 for (unsigned k = 0; k < Half; ++k) { 6860 int MIdx = M[i + j + k]; 6861 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6862 return false; 6863 Idx += 2; 6864 } 6865 } 6866 } 6867 6868 if (M.size() == NumElts*2) 6869 WhichResult = 0; 6870 6871 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6872 if (VT.is64BitVector() && EltSz == 32) 6873 return false; 6874 6875 return true; 6876 } 6877 6878 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6879 // that pairs of elements of the shufflemask represent the same index in each 6880 // vector incrementing sequentially through the vectors. 6881 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6882 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6883 // v2={e,f,g,h} 6884 // Requires similar checks to that of isVTRNMask with respect the how results 6885 // are returned. 6886 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6887 unsigned EltSz = VT.getScalarSizeInBits(); 6888 if (EltSz == 64) 6889 return false; 6890 6891 unsigned NumElts = VT.getVectorNumElements(); 6892 if (M.size() != NumElts && M.size() != NumElts*2) 6893 return false; 6894 6895 for (unsigned i = 0; i < M.size(); i += NumElts) { 6896 WhichResult = SelectPairHalf(NumElts, M, i); 6897 unsigned Idx = WhichResult * NumElts / 2; 6898 for (unsigned j = 0; j < NumElts; j += 2) { 6899 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6900 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 6901 return false; 6902 Idx += 1; 6903 } 6904 } 6905 6906 if (M.size() == NumElts*2) 6907 WhichResult = 0; 6908 6909 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6910 if (VT.is64BitVector() && EltSz == 32) 6911 return false; 6912 6913 return true; 6914 } 6915 6916 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 6917 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6918 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 6919 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6920 unsigned EltSz = VT.getScalarSizeInBits(); 6921 if (EltSz == 64) 6922 return false; 6923 6924 unsigned NumElts = VT.getVectorNumElements(); 6925 if (M.size() != NumElts && M.size() != NumElts*2) 6926 return false; 6927 6928 for (unsigned i = 0; i < M.size(); i += NumElts) { 6929 WhichResult = SelectPairHalf(NumElts, M, i); 6930 unsigned Idx = WhichResult * NumElts / 2; 6931 for (unsigned j = 0; j < NumElts; j += 2) { 6932 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6933 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 6934 return false; 6935 Idx += 1; 6936 } 6937 } 6938 6939 if (M.size() == NumElts*2) 6940 WhichResult = 0; 6941 6942 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6943 if (VT.is64BitVector() && EltSz == 32) 6944 return false; 6945 6946 return true; 6947 } 6948 6949 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 6950 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 6951 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 6952 unsigned &WhichResult, 6953 bool &isV_UNDEF) { 6954 isV_UNDEF = false; 6955 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 6956 return ARMISD::VTRN; 6957 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 6958 return ARMISD::VUZP; 6959 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 6960 return ARMISD::VZIP; 6961 6962 isV_UNDEF = true; 6963 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6964 return ARMISD::VTRN; 6965 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6966 return ARMISD::VUZP; 6967 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6968 return ARMISD::VZIP; 6969 6970 return 0; 6971 } 6972 6973 /// \return true if this is a reverse operation on an vector. 6974 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 6975 unsigned NumElts = VT.getVectorNumElements(); 6976 // Make sure the mask has the right size. 6977 if (NumElts != M.size()) 6978 return false; 6979 6980 // Look for <15, ..., 3, -1, 1, 0>. 6981 for (unsigned i = 0; i != NumElts; ++i) 6982 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 6983 return false; 6984 6985 return true; 6986 } 6987 6988 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { 6989 unsigned NumElts = VT.getVectorNumElements(); 6990 // Make sure the mask has the right size. 6991 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 6992 return false; 6993 6994 // If Top 6995 // Look for <0, N, 2, N+2, 4, N+4, ..>. 6996 // This inserts Input2 into Input1 6997 // else if not Top 6998 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 6999 // This inserts Input1 into Input2 7000 unsigned Offset = Top ? 0 : 1; 7001 for (unsigned i = 0; i < NumElts; i+=2) { 7002 if (M[i] >= 0 && M[i] != (int)i) 7003 return false; 7004 if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset)) 7005 return false; 7006 } 7007 7008 return true; 7009 } 7010 7011 // If N is an integer constant that can be moved into a register in one 7012 // instruction, return an SDValue of such a constant (will become a MOV 7013 // instruction). Otherwise return null. 
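// On Thumb1 this accepts Val <= 255 or ~Val <= 255; on ARM/Thumb2 it accepts
// any value (or its bitwise complement) that getSOImmVal can encode, i.e. an
// 8-bit value rotated right by an even amount.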
7014 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7015 const ARMSubtarget *ST, const SDLoc &dl) { 7016 uint64_t Val; 7017 if (!isa<ConstantSDNode>(N)) 7018 return SDValue(); 7019 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7020 7021 if (ST->isThumb1Only()) { 7022 if (Val <= 255 || ~Val <= 255) 7023 return DAG.getConstant(Val, dl, MVT::i32); 7024 } else { 7025 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7026 return DAG.getConstant(Val, dl, MVT::i32); 7027 } 7028 return SDValue(); 7029 } 7030 7031 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7032 const ARMSubtarget *ST) { 7033 SDLoc dl(Op); 7034 EVT VT = Op.getValueType(); 7035 7036 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7037 7038 unsigned NumElts = VT.getVectorNumElements(); 7039 unsigned BoolMask; 7040 unsigned BitsPerBool; 7041 if (NumElts == 4) { 7042 BitsPerBool = 4; 7043 BoolMask = 0xf; 7044 } else if (NumElts == 8) { 7045 BitsPerBool = 2; 7046 BoolMask = 0x3; 7047 } else if (NumElts == 16) { 7048 BitsPerBool = 1; 7049 BoolMask = 0x1; 7050 } else 7051 return SDValue(); 7052 7053 // If this is a single value copied into all lanes (a splat), we can just sign 7054 // extend that single value 7055 SDValue FirstOp = Op.getOperand(0); 7056 if (!isa<ConstantSDNode>(FirstOp) && 7057 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7058 [&FirstOp](SDUse &U) { 7059 return U.get().isUndef() || U.get() == FirstOp; 7060 })) { 7061 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7062 DAG.getValueType(MVT::i1)); 7063 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7064 } 7065 7066 // First create base with bits set where known 7067 unsigned Bits32 = 0; 7068 for (unsigned i = 0; i < NumElts; ++i) { 7069 SDValue V = Op.getOperand(i); 7070 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7071 continue; 7072 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); 7073 if (BitSet) 7074 Bits32 |= BoolMask << (i * BitsPerBool); 7075 } 7076 7077 // Add in unknown nodes 7078 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7079 DAG.getConstant(Bits32, dl, MVT::i32)); 7080 for (unsigned i = 0; i < NumElts; ++i) { 7081 SDValue V = Op.getOperand(i); 7082 if (isa<ConstantSDNode>(V) || V.isUndef()) 7083 continue; 7084 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7085 DAG.getConstant(i, dl, MVT::i32)); 7086 } 7087 7088 return Base; 7089 } 7090 7091 // If this is a case we can't handle, return null and let the default 7092 // expansion code take care of it. 7093 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7094 const ARMSubtarget *ST) const { 7095 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7096 SDLoc dl(Op); 7097 EVT VT = Op.getValueType(); 7098 7099 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7100 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7101 7102 APInt SplatBits, SplatUndef; 7103 unsigned SplatBitSize; 7104 bool HasAnyUndefs; 7105 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7106 if (SplatUndef.isAllOnesValue()) 7107 return DAG.getUNDEF(VT); 7108 7109 if ((ST->hasNEON() && SplatBitSize <= 64) || 7110 (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { 7111 // Check if an immediate VMOV works. 
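      // For example, a v4i32 splat of 0x000000ff can be built with an
      // immediate VMOV; if no VMOV encoding matches, the complemented value is
      // tried with VMVN below.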
7112 EVT VmovVT; 7113 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 7114 SplatUndef.getZExtValue(), SplatBitSize, 7115 DAG, dl, VmovVT, VT.is128BitVector(), 7116 VMOVModImm); 7117 7118 if (Val.getNode()) { 7119 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7120 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7121 } 7122 7123 // Try an immediate VMVN. 7124 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7125 Val = isVMOVModifiedImm( 7126 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, 7127 DAG, dl, VmovVT, VT.is128BitVector(), 7128 ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7129 if (Val.getNode()) { 7130 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7131 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7132 } 7133 7134 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7135 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7136 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7137 if (ImmVal != -1) { 7138 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7139 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7140 } 7141 } 7142 } 7143 } 7144 7145 // Scan through the operands to see if only one value is used. 7146 // 7147 // As an optimisation, even if more than one value is used it may be more 7148 // profitable to splat with one value then change some lanes. 7149 // 7150 // Heuristically we decide to do this if the vector has a "dominant" value, 7151 // defined as splatted to more than half of the lanes. 7152 unsigned NumElts = VT.getVectorNumElements(); 7153 bool isOnlyLowElement = true; 7154 bool usesOnlyOneValue = true; 7155 bool hasDominantValue = false; 7156 bool isConstant = true; 7157 7158 // Map of the number of times a particular SDValue appears in the 7159 // element list. 7160 DenseMap<SDValue, unsigned> ValueCounts; 7161 SDValue Value; 7162 for (unsigned i = 0; i < NumElts; ++i) { 7163 SDValue V = Op.getOperand(i); 7164 if (V.isUndef()) 7165 continue; 7166 if (i > 0) 7167 isOnlyLowElement = false; 7168 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7169 isConstant = false; 7170 7171 ValueCounts.insert(std::make_pair(V, 0)); 7172 unsigned &Count = ValueCounts[V]; 7173 7174 // Is this value dominant? (takes up more than half of the lanes) 7175 if (++Count > (NumElts / 2)) { 7176 hasDominantValue = true; 7177 Value = V; 7178 } 7179 } 7180 if (ValueCounts.size() != 1) 7181 usesOnlyOneValue = false; 7182 if (!Value.getNode() && !ValueCounts.empty()) 7183 Value = ValueCounts.begin()->first; 7184 7185 if (ValueCounts.empty()) 7186 return DAG.getUNDEF(VT); 7187 7188 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7189 // Keep going if we are hitting this case. 7190 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7191 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7192 7193 unsigned EltSize = VT.getScalarSizeInBits(); 7194 7195 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7196 // i32 and try again. 7197 if (hasDominantValue && EltSize <= 32) { 7198 if (!isConstant) { 7199 SDValue N; 7200 7201 // If we are VDUPing a value that comes directly from a vector, that will 7202 // cause an unnecessary move to and from a GPR, where instead we could 7203 // just use VDUPLANE. We can only do this if the lane being extracted 7204 // is at a constant index, as the VDUP from lane instructions only have 7205 // constant-index forms. 
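// For example, a splat of (extractelement <4 x i32> %v, i32 2) becomes
// VDUPLANE(%v, 2) rather than a move of lane 2 out to a core register
// followed by a VDUP from it.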
7206 ConstantSDNode *constIndex; 7207 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7208 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7209 // We need to create a new undef vector to use for the VDUPLANE if the 7210 // size of the vector from which we get the value is different than the 7211 // size of the vector that we need to create. We will insert the element 7212 // such that the register coalescer will remove unnecessary copies. 7213 if (VT != Value->getOperand(0).getValueType()) { 7214 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7215 VT.getVectorNumElements(); 7216 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7217 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7218 Value, DAG.getConstant(index, dl, MVT::i32)), 7219 DAG.getConstant(index, dl, MVT::i32)); 7220 } else 7221 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7222 Value->getOperand(0), Value->getOperand(1)); 7223 } else 7224 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7225 7226 if (!usesOnlyOneValue) { 7227 // The dominant value was splatted as 'N', but we now have to insert 7228 // all differing elements. 7229 for (unsigned I = 0; I < NumElts; ++I) { 7230 if (Op.getOperand(I) == Value) 7231 continue; 7232 SmallVector<SDValue, 3> Ops; 7233 Ops.push_back(N); 7234 Ops.push_back(Op.getOperand(I)); 7235 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7236 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7237 } 7238 } 7239 return N; 7240 } 7241 if (VT.getVectorElementType().isFloatingPoint()) { 7242 SmallVector<SDValue, 8> Ops; 7243 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7244 assert(FVT == MVT::f32 || FVT == MVT::f16); 7245 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7246 for (unsigned i = 0; i < NumElts; ++i) 7247 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7248 Op.getOperand(i))); 7249 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7250 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7251 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7252 if (Val.getNode()) 7253 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7254 } 7255 if (usesOnlyOneValue) { 7256 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7257 if (isConstant && Val.getNode()) 7258 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7259 } 7260 } 7261 7262 // If all elements are constants and the case above didn't get hit, fall back 7263 // to the default expansion, which will generate a load from the constant 7264 // pool. 7265 if (isConstant) 7266 return SDValue(); 7267 7268 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 7269 if (NumElts >= 4) { 7270 SDValue shuffle = ReconstructShuffle(Op, DAG); 7271 if (shuffle != SDValue()) 7272 return shuffle; 7273 } 7274 7275 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7276 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7277 // into two 64-bit vectors; we might discover a better way to lower it. 
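// e.g. a v8i16 BUILD_VECTOR is split into two v4i16 halves, each half is
// lowered recursively, and the results are rejoined with a CONCAT_VECTORS.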
7278 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7279 EVT ExtVT = VT.getVectorElementType(); 7280 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7281 SDValue Lower = 7282 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7283 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7284 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7285 SDValue Upper = DAG.getBuildVector( 7286 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7287 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7288 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7289 if (Lower && Upper) 7290 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7291 } 7292 7293 // Vectors with 32- or 64-bit elements can be built by directly assigning 7294 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7295 // will be legalized. 7296 if (EltSize >= 32) { 7297 // Do the expansion with floating-point types, since that is what the VFP 7298 // registers are defined to use, and since i64 is not legal. 7299 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7300 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7301 SmallVector<SDValue, 8> Ops; 7302 for (unsigned i = 0; i < NumElts; ++i) 7303 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7304 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7305 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7306 } 7307 7308 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7309 // know the default expansion would otherwise fall back on something even 7310 // worse. For a vector with one or two non-undef values, that's 7311 // scalar_to_vector for the elements followed by a shuffle (provided the 7312 // shuffle is valid for the target) and materialization element by element 7313 // on the stack followed by a load for everything else. 7314 if (!isConstant && !usesOnlyOneValue) { 7315 SDValue Vec = DAG.getUNDEF(VT); 7316 for (unsigned i = 0 ; i < NumElts; ++i) { 7317 SDValue V = Op.getOperand(i); 7318 if (V.isUndef()) 7319 continue; 7320 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 7321 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7322 } 7323 return Vec; 7324 } 7325 7326 return SDValue(); 7327 } 7328 7329 // Gather data to see if the operation can be modelled as a 7330 // shuffle in combination with VEXTs. 7331 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 7332 SelectionDAG &DAG) const { 7333 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7334 SDLoc dl(Op); 7335 EVT VT = Op.getValueType(); 7336 unsigned NumElts = VT.getVectorNumElements(); 7337 7338 struct ShuffleSourceInfo { 7339 SDValue Vec; 7340 unsigned MinElt = std::numeric_limits<unsigned>::max(); 7341 unsigned MaxElt = 0; 7342 7343 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 7344 // be compatible with the shuffle we intend to construct. As a result 7345 // ShuffleVec will be some sliding window into the original Vec. 7346 SDValue ShuffleVec; 7347 7348 // Code should guarantee that element i in Vec starts at element "WindowBase 7349 // + i * WindowScale in ShuffleVec". 7350 int WindowBase = 0; 7351 int WindowScale = 1; 7352 7353 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 7354 7355 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 7356 }; 7357 7358 // First gather all vectors used as an immediate source for this BUILD_VECTOR 7359 // node. 
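// e.g. a BUILD_VECTOR whose operands are (extractelt %a, 0),
// (extractelt %b, 1), (extractelt %a, 2), (extractelt %b, 3) has the two
// sources %a and %b and can be modelled as a single shuffle of them.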
7360 SmallVector<ShuffleSourceInfo, 2> Sources; 7361 for (unsigned i = 0; i < NumElts; ++i) { 7362 SDValue V = Op.getOperand(i); 7363 if (V.isUndef()) 7364 continue; 7365 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 7366 // A shuffle can only come from building a vector from various 7367 // elements of other vectors. 7368 return SDValue(); 7369 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 7370 // Furthermore, shuffles require a constant mask, whereas extractelts 7371 // accept variable indices. 7372 return SDValue(); 7373 } 7374 7375 // Add this element source to the list if it's not already there. 7376 SDValue SourceVec = V.getOperand(0); 7377 auto Source = llvm::find(Sources, SourceVec); 7378 if (Source == Sources.end()) 7379 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7380 7381 // Update the minimum and maximum lane number seen. 7382 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7383 Source->MinElt = std::min(Source->MinElt, EltNo); 7384 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7385 } 7386 7387 // Currently only do something sane when at most two source vectors 7388 // are involved. 7389 if (Sources.size() > 2) 7390 return SDValue(); 7391 7392 // Find out the smallest element size among result and two sources, and use 7393 // it as element size to build the shuffle_vector. 7394 EVT SmallestEltTy = VT.getVectorElementType(); 7395 for (auto &Source : Sources) { 7396 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7397 if (SrcEltTy.bitsLT(SmallestEltTy)) 7398 SmallestEltTy = SrcEltTy; 7399 } 7400 unsigned ResMultiplier = 7401 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7402 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7403 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7404 7405 // If the source vector is too wide or too narrow, we may nevertheless be able 7406 // to construct a compatible shuffle either by concatenating it with UNDEF or 7407 // extracting a suitable range of elements. 7408 for (auto &Src : Sources) { 7409 EVT SrcVT = Src.ShuffleVec.getValueType(); 7410 7411 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 7412 continue; 7413 7414 // This stage of the search produces a source with the same element type as 7415 // the original, but with a total width matching the BUILD_VECTOR output. 7416 EVT EltVT = SrcVT.getVectorElementType(); 7417 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 7418 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7419 7420 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 7421 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 7422 return SDValue(); 7423 // We can pad out the smaller vector for free, so if it's part of a 7424 // shuffle... 
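// (e.g. a 64-bit source feeding a 128-bit result, such as a v8i8 used by a
// v16i8 BUILD_VECTOR, is widened by concatenating it with an undef v8i8).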
7425 Src.ShuffleVec = 7426 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 7427 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 7428 continue; 7429 } 7430 7431 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 7432 return SDValue(); 7433 7434 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 7435 // Span too large for a VEXT to cope 7436 return SDValue(); 7437 } 7438 7439 if (Src.MinElt >= NumSrcElts) { 7440 // The extraction can just take the second half 7441 Src.ShuffleVec = 7442 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7443 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7444 Src.WindowBase = -NumSrcElts; 7445 } else if (Src.MaxElt < NumSrcElts) { 7446 // The extraction can just take the first half 7447 Src.ShuffleVec = 7448 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7449 DAG.getConstant(0, dl, MVT::i32)); 7450 } else { 7451 // An actual VEXT is needed 7452 SDValue VEXTSrc1 = 7453 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7454 DAG.getConstant(0, dl, MVT::i32)); 7455 SDValue VEXTSrc2 = 7456 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7457 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7458 7459 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 7460 VEXTSrc2, 7461 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 7462 Src.WindowBase = -Src.MinElt; 7463 } 7464 } 7465 7466 // Another possible incompatibility occurs from the vector element types. We 7467 // can fix this by bitcasting the source vectors to the same type we intend 7468 // for the shuffle. 7469 for (auto &Src : Sources) { 7470 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 7471 if (SrcEltTy == SmallestEltTy) 7472 continue; 7473 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 7474 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 7475 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7476 Src.WindowBase *= Src.WindowScale; 7477 } 7478 7479 // Final sanity check before we try to actually produce a shuffle. 7480 LLVM_DEBUG(for (auto Src 7481 : Sources) 7482 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 7483 7484 // The stars all align, our next step is to produce the mask for the shuffle. 7485 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 7486 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 7487 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 7488 SDValue Entry = Op.getOperand(i); 7489 if (Entry.isUndef()) 7490 continue; 7491 7492 auto Src = llvm::find(Sources, Entry.getOperand(0)); 7493 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 7494 7495 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 7496 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 7497 // segment. 7498 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 7499 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 7500 VT.getScalarSizeInBits()); 7501 int LanesDefined = BitsDefined / BitsPerShuffleLane; 7502 7503 // This source is expected to fill ResMultiplier lanes of the final shuffle, 7504 // starting at the appropriate offset. 7505 int *LaneMask = &Mask[i * ResMultiplier]; 7506 7507 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 7508 ExtractBase += NumElts * (Src - Sources.begin()); 7509 for (int j = 0; j < LanesDefined; ++j) 7510 LaneMask[j] = ExtractBase + j; 7511 } 7512 7513 7514 // We can't handle more than two sources. 
This should have already 7515 // been checked before this point. 7516 assert(Sources.size() <= 2 && "Too many sources!"); 7517 7518 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 7519 for (unsigned i = 0; i < Sources.size(); ++i) 7520 ShuffleOps[i] = Sources[i].ShuffleVec; 7521 7522 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 7523 ShuffleOps[1], Mask, DAG); 7524 if (!Shuffle) 7525 return SDValue(); 7526 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 7527 } 7528 7529 enum ShuffleOpCodes { 7530 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7531 OP_VREV, 7532 OP_VDUP0, 7533 OP_VDUP1, 7534 OP_VDUP2, 7535 OP_VDUP3, 7536 OP_VEXT1, 7537 OP_VEXT2, 7538 OP_VEXT3, 7539 OP_VUZPL, // VUZP, left result 7540 OP_VUZPR, // VUZP, right result 7541 OP_VZIPL, // VZIP, left result 7542 OP_VZIPR, // VZIP, right result 7543 OP_VTRNL, // VTRN, left result 7544 OP_VTRNR // VTRN, right result 7545 }; 7546 7547 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 7548 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7549 switch (OpNum) { 7550 case OP_COPY: 7551 case OP_VREV: 7552 case OP_VDUP0: 7553 case OP_VDUP1: 7554 case OP_VDUP2: 7555 case OP_VDUP3: 7556 return true; 7557 } 7558 return false; 7559 } 7560 7561 /// isShuffleMaskLegal - Targets can use this to indicate that they only 7562 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7563 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7564 /// are assumed to be legal. 7565 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7566 if (VT.getVectorNumElements() == 4 && 7567 (VT.is128BitVector() || VT.is64BitVector())) { 7568 unsigned PFIndexes[4]; 7569 for (unsigned i = 0; i != 4; ++i) { 7570 if (M[i] < 0) 7571 PFIndexes[i] = 8; 7572 else 7573 PFIndexes[i] = M[i]; 7574 } 7575 7576 // Compute the index in the perfect shuffle table. 7577 unsigned PFTableIndex = 7578 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7579 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7580 unsigned Cost = (PFEntry >> 30); 7581 7582 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7583 return true; 7584 } 7585 7586 bool ReverseVEXT, isV_UNDEF; 7587 unsigned Imm, WhichResult; 7588 7589 unsigned EltSize = VT.getScalarSizeInBits(); 7590 if (EltSize >= 32 || 7591 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7592 ShuffleVectorInst::isIdentityMask(M) || 7593 isVREVMask(M, VT, 64) || 7594 isVREVMask(M, VT, 32) || 7595 isVREVMask(M, VT, 16)) 7596 return true; 7597 else if (Subtarget->hasNEON() && 7598 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7599 isVTBLMask(M, VT) || 7600 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7601 return true; 7602 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7603 isReverseMask(M, VT)) 7604 return true; 7605 else if (Subtarget->hasMVEIntegerOps() && 7606 (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) 7607 return true; 7608 else 7609 return false; 7610 } 7611 7612 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7613 /// the specified operations to build the shuffle. 
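/// Each 32-bit table entry packs a cost in bits 31:30, one of the
/// ShuffleOpCodes in bits 29:26, and two 13-bit operand IDs in bits 25:13 and
/// 12:0 that are themselves indices back into the perfect-shuffle table.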
7614 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7615 SDValue RHS, SelectionDAG &DAG, 7616 const SDLoc &dl) { 7617 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7618 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7619 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7620 7621 if (OpNum == OP_COPY) { 7622 if (LHSID == (1*9+2)*9+3) return LHS; 7623 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 7624 return RHS; 7625 } 7626 7627 SDValue OpLHS, OpRHS; 7628 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7629 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7630 EVT VT = OpLHS.getValueType(); 7631 7632 switch (OpNum) { 7633 default: llvm_unreachable("Unknown shuffle opcode!"); 7634 case OP_VREV: 7635 // VREV divides the vector in half and swaps within the half. 7636 if (VT.getVectorElementType() == MVT::i32 || 7637 VT.getVectorElementType() == MVT::f32) 7638 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 7639 // vrev <4 x i16> -> VREV32 7640 if (VT.getVectorElementType() == MVT::i16) 7641 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 7642 // vrev <4 x i8> -> VREV16 7643 assert(VT.getVectorElementType() == MVT::i8); 7644 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 7645 case OP_VDUP0: 7646 case OP_VDUP1: 7647 case OP_VDUP2: 7648 case OP_VDUP3: 7649 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7650 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 7651 case OP_VEXT1: 7652 case OP_VEXT2: 7653 case OP_VEXT3: 7654 return DAG.getNode(ARMISD::VEXT, dl, VT, 7655 OpLHS, OpRHS, 7656 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 7657 case OP_VUZPL: 7658 case OP_VUZPR: 7659 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 7660 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 7661 case OP_VZIPL: 7662 case OP_VZIPR: 7663 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 7664 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 7665 case OP_VTRNL: 7666 case OP_VTRNR: 7667 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 7668 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 7669 } 7670 } 7671 7672 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 7673 ArrayRef<int> ShuffleMask, 7674 SelectionDAG &DAG) { 7675 // Check to see if we can use the VTBL instruction. 7676 SDValue V1 = Op.getOperand(0); 7677 SDValue V2 = Op.getOperand(1); 7678 SDLoc DL(Op); 7679 7680 SmallVector<SDValue, 8> VTBLMask; 7681 for (ArrayRef<int>::iterator 7682 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 7683 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 7684 7685 if (V2.getNode()->isUndef()) 7686 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 7687 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7688 7689 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 7690 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7691 } 7692 7693 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 7694 SelectionDAG &DAG) { 7695 SDLoc DL(Op); 7696 SDValue OpLHS = Op.getOperand(0); 7697 EVT VT = OpLHS.getValueType(); 7698 7699 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 7700 "Expect an v8i16/v16i8 type"); 7701 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 7702 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 7703 // extract the first 8 bytes into the top double word and the last 8 bytes 7704 // into the bottom double word. The v8i16 case is similar. 7705 unsigned ExtractNum = (VT == MVT::v16i8) ? 
8 : 4; 7706 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 7707 DAG.getConstant(ExtractNum, DL, MVT::i32)); 7708 } 7709 7710 static EVT getVectorTyFromPredicateVector(EVT VT) { 7711 switch (VT.getSimpleVT().SimpleTy) { 7712 case MVT::v4i1: 7713 return MVT::v4i32; 7714 case MVT::v8i1: 7715 return MVT::v8i16; 7716 case MVT::v16i1: 7717 return MVT::v16i8; 7718 default: 7719 llvm_unreachable("Unexpected vector predicate type"); 7720 } 7721 } 7722 7723 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 7724 SelectionDAG &DAG) { 7725 // Converting from boolean predicates to integers involves creating a vector 7726 // of all ones or all zeroes and selecting the lanes based upon the real 7727 // predicate. 7728 SDValue AllOnes = 7729 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 7730 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 7731 7732 SDValue AllZeroes = 7733 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 7734 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 7735 7736 // Get full vector type from predicate type 7737 EVT NewVT = getVectorTyFromPredicateVector(VT); 7738 7739 SDValue RecastV1; 7740 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 7741 // this to a v16i1. This cannot be done with an ordinary bitcast because the 7742 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 7743 // since we know in hardware the sizes are really the same. 7744 if (VT != MVT::v16i1) 7745 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 7746 else 7747 RecastV1 = Pred; 7748 7749 // Select either all ones or zeroes depending upon the real predicate bits. 7750 SDValue PredAsVector = 7751 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 7752 7753 // Recast our new predicate-as-integer v16i8 vector into something 7754 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate. 7755 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 7756 } 7757 7758 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 7759 const ARMSubtarget *ST) { 7760 EVT VT = Op.getValueType(); 7761 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7762 ArrayRef<int> ShuffleMask = SVN->getMask(); 7763 7764 assert(ST->hasMVEIntegerOps() && 7765 "No support for vector shuffle of boolean predicates"); 7766 7767 SDValue V1 = Op.getOperand(0); 7768 SDLoc dl(Op); 7769 if (isReverseMask(ShuffleMask, VT)) { 7770 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 7771 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 7772 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 7773 DAG.getConstant(16, dl, MVT::i32)); 7774 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 7775 } 7776 7777 // Until we can come up with optimised cases for every single vector 7778 // shuffle in existence we have chosen the least painful strategy. This is 7779 // to essentially promote the boolean predicate to a 8-bit integer, where 7780 // each predicate represents a byte. Then we fall back on a normal integer 7781 // vector shuffle and convert the result back into a predicate vector. In 7782 // many cases the generated code might be even better than scalar code 7783 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 7784 // fields in a register into 8 other arbitrary 2-bit fields! 
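// For example, a v8i1 shuffle is performed here on a v8i16 of all-ones /
// all-zeroes lanes, and the shuffled vector is then compared against zero
// (VCMPZ with ARMCC::NE) to produce the final v8i1 predicate.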
7785 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 7786 EVT NewVT = PredAsVector.getValueType(); 7787 7788 // Do the shuffle! 7789 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 7790 DAG.getUNDEF(NewVT), ShuffleMask); 7791 7792 // Now return the result of comparing the shuffled vector with zero, 7793 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 7794 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 7795 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 7796 } 7797 7798 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 7799 const ARMSubtarget *ST) { 7800 SDValue V1 = Op.getOperand(0); 7801 SDValue V2 = Op.getOperand(1); 7802 SDLoc dl(Op); 7803 EVT VT = Op.getValueType(); 7804 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7805 unsigned EltSize = VT.getScalarSizeInBits(); 7806 7807 if (ST->hasMVEIntegerOps() && EltSize == 1) 7808 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 7809 7810 // Convert shuffles that are directly supported on NEON to target-specific 7811 // DAG nodes, instead of keeping them as shuffles and matching them again 7812 // during code selection. This is more efficient and avoids the possibility 7813 // of inconsistencies between legalization and selection. 7814 // FIXME: floating-point vectors should be canonicalized to integer vectors 7815 // of the same type so that they get CSEd properly. 7816 ArrayRef<int> ShuffleMask = SVN->getMask(); 7817 7818 if (EltSize <= 32) { 7819 if (SVN->isSplat()) { 7820 int Lane = SVN->getSplatIndex(); 7821 // If this is an undef splat, generate it via "just" vdup, if possible. 7822 if (Lane == -1) Lane = 0; 7823 7824 // Test if V1 is a SCALAR_TO_VECTOR. 7825 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7826 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7827 } 7828 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 7829 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 7830 // reaches it). 7831 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 7832 !isa<ConstantSDNode>(V1.getOperand(0))) { 7833 bool IsScalarToVector = true; 7834 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7835 if (!V1.getOperand(i).isUndef()) { 7836 IsScalarToVector = false; 7837 break; 7838 } 7839 if (IsScalarToVector) 7840 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7841 } 7842 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 7843 DAG.getConstant(Lane, dl, MVT::i32)); 7844 } 7845 7846 bool ReverseVEXT = false; 7847 unsigned Imm = 0; 7848 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7849 if (ReverseVEXT) 7850 std::swap(V1, V2); 7851 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7852 DAG.getConstant(Imm, dl, MVT::i32)); 7853 } 7854 7855 if (isVREVMask(ShuffleMask, VT, 64)) 7856 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7857 if (isVREVMask(ShuffleMask, VT, 32)) 7858 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 7859 if (isVREVMask(ShuffleMask, VT, 16)) 7860 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 7861 7862 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 7863 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 7864 DAG.getConstant(Imm, dl, MVT::i32)); 7865 } 7866 7867 // Check for Neon shuffles that modify both input vectors in place.
7868 // If both results are used, i.e., if there are two shuffles with the same 7869 // source operands and with masks corresponding to both results of one of 7870 // these operations, DAG memoization will ensure that a single node is 7871 // used for both shuffles. 7872 unsigned WhichResult = 0; 7873 bool isV_UNDEF = false; 7874 if (ST->hasNEON()) { 7875 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7876 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 7877 if (isV_UNDEF) 7878 V2 = V1; 7879 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 7880 .getValue(WhichResult); 7881 } 7882 } 7883 if (ST->hasMVEIntegerOps()) { 7884 if (isVMOVNMask(ShuffleMask, VT, 0)) 7885 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 7886 DAG.getConstant(0, dl, MVT::i32)); 7887 if (isVMOVNMask(ShuffleMask, VT, 1)) 7888 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 7889 DAG.getConstant(1, dl, MVT::i32)); 7890 } 7891 7892 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 7893 // shuffles that produce a result larger than their operands with: 7894 // shuffle(concat(v1, undef), concat(v2, undef)) 7895 // -> 7896 // shuffle(concat(v1, v2), undef) 7897 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 7898 // 7899 // This is useful in the general case, but there are special cases where 7900 // native shuffles produce larger results: the two-result ops. 7901 // 7902 // Look through the concat when lowering them: 7903 // shuffle(concat(v1, v2), undef) 7904 // -> 7905 // concat(VZIP(v1, v2):0, :1) 7906 // 7907 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 7908 SDValue SubV1 = V1->getOperand(0); 7909 SDValue SubV2 = V1->getOperand(1); 7910 EVT SubVT = SubV1.getValueType(); 7911 7912 // We expect these to have been canonicalized to -1. 7913 assert(llvm::all_of(ShuffleMask, [&](int i) { 7914 return i < (int)VT.getVectorNumElements(); 7915 }) && "Unexpected shuffle index into UNDEF operand!"); 7916 7917 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7918 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 7919 if (isV_UNDEF) 7920 SubV2 = SubV1; 7921 assert((WhichResult == 0) && 7922 "In-place shuffle of concat can only have one result!"); 7923 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 7924 SubV1, SubV2); 7925 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 7926 Res.getValue(1)); 7927 } 7928 } 7929 } 7930 7931 // If the shuffle is not directly supported and it has 4 elements, use 7932 // the PerfectShuffle-generated table to synthesize it from other shuffles. 7933 unsigned NumElts = VT.getVectorNumElements(); 7934 if (NumElts == 4) { 7935 unsigned PFIndexes[4]; 7936 for (unsigned i = 0; i != 4; ++i) { 7937 if (ShuffleMask[i] < 0) 7938 PFIndexes[i] = 8; 7939 else 7940 PFIndexes[i] = ShuffleMask[i]; 7941 } 7942 7943 // Compute the index in the perfect shuffle table. 
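// Each of the four mask entries is a digit in [0, 8] (8 standing for an undef
// lane), so the table is indexed in base 9: m0*729 + m1*81 + m2*9 + m3.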
7944 unsigned PFTableIndex = 7945 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7946 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7947 unsigned Cost = (PFEntry >> 30); 7948 7949 if (Cost <= 4) { 7950 if (ST->hasNEON()) 7951 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 7952 else if (isLegalMVEShuffleOp(PFEntry)) { 7953 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7954 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7955 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 7956 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 7957 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 7958 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 7959 } 7960 } 7961 } 7962 7963 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 7964 if (EltSize >= 32) { 7965 // Do the expansion with floating-point types, since that is what the VFP 7966 // registers are defined to use, and since i64 is not legal. 7967 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7968 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7969 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 7970 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 7971 SmallVector<SDValue, 8> Ops; 7972 for (unsigned i = 0; i < NumElts; ++i) { 7973 if (ShuffleMask[i] < 0) 7974 Ops.push_back(DAG.getUNDEF(EltVT)); 7975 else 7976 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 7977 ShuffleMask[i] < (int)NumElts ? V1 : V2, 7978 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 7979 dl, MVT::i32))); 7980 } 7981 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7982 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7983 } 7984 7985 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 7986 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 7987 7988 if (ST->hasNEON() && VT == MVT::v8i8) 7989 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 7990 return NewOp; 7991 7992 return SDValue(); 7993 } 7994 7995 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 7996 const ARMSubtarget *ST) { 7997 EVT VecVT = Op.getOperand(0).getValueType(); 7998 SDLoc dl(Op); 7999 8000 assert(ST->hasMVEIntegerOps() && 8001 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8002 8003 SDValue Conv = 8004 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8005 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8006 unsigned LaneWidth = 8007 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8008 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8009 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8010 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8011 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8012 DAG.getConstant(~Mask, dl, MVT::i32)); 8013 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8014 } 8015 8016 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8017 SelectionDAG &DAG) const { 8018 // INSERT_VECTOR_ELT is legal only for immediate indexes. 
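// Returning SDValue() for a variable lane index lets the target-independent
// expansion handle the insertion instead.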
8019 SDValue Lane = Op.getOperand(2); 8020 if (!isa<ConstantSDNode>(Lane)) 8021 return SDValue(); 8022 8023 SDValue Elt = Op.getOperand(1); 8024 EVT EltVT = Elt.getValueType(); 8025 8026 if (Subtarget->hasMVEIntegerOps() && 8027 Op.getValueType().getScalarSizeInBits() == 1) 8028 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8029 8030 if (getTypeAction(*DAG.getContext(), EltVT) == 8031 TargetLowering::TypePromoteFloat) { 8032 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8033 // but the type system will try to do that if we don't intervene. 8034 // Reinterpret any such vector-element insertion as one with the 8035 // corresponding integer types. 8036 8037 SDLoc dl(Op); 8038 8039 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8040 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8041 TargetLowering::TypePromoteFloat); 8042 8043 SDValue VecIn = Op.getOperand(0); 8044 EVT VecVT = VecIn.getValueType(); 8045 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8046 VecVT.getVectorNumElements()); 8047 8048 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8049 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8050 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8051 IVecIn, IElt, Lane); 8052 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8053 } 8054 8055 return Op; 8056 } 8057 8058 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8059 const ARMSubtarget *ST) { 8060 EVT VecVT = Op.getOperand(0).getValueType(); 8061 SDLoc dl(Op); 8062 8063 assert(ST->hasMVEIntegerOps() && 8064 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8065 8066 SDValue Conv = 8067 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8068 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8069 unsigned LaneWidth = 8070 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8071 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 8072 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8073 return Shift; 8074 } 8075 8076 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8077 const ARMSubtarget *ST) { 8078 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
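// Note that sub-32-bit elements extracted to an i32 result are handled below
// with VGETLANEu, so the zero-extension is folded into the lane move.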
8079 SDValue Lane = Op.getOperand(1); 8080 if (!isa<ConstantSDNode>(Lane)) 8081 return SDValue(); 8082 8083 SDValue Vec = Op.getOperand(0); 8084 EVT VT = Vec.getValueType(); 8085 8086 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8087 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); 8088 8089 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 8090 SDLoc dl(Op); 8091 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 8092 } 8093 8094 return Op; 8095 } 8096 8097 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, 8098 const ARMSubtarget *ST) { 8099 SDValue V1 = Op.getOperand(0); 8100 SDValue V2 = Op.getOperand(1); 8101 SDLoc dl(Op); 8102 EVT VT = Op.getValueType(); 8103 EVT Op1VT = V1.getValueType(); 8104 EVT Op2VT = V2.getValueType(); 8105 unsigned NumElts = VT.getVectorNumElements(); 8106 8107 assert(Op1VT == Op2VT && "Operand types don't match!"); 8108 assert(VT.getScalarSizeInBits() == 1 && 8109 "Unexpected custom CONCAT_VECTORS lowering"); 8110 assert(ST->hasMVEIntegerOps() && 8111 "CONCAT_VECTORS lowering only supported for MVE"); 8112 8113 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8114 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); 8115 8116 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets 8117 // promoted to v8i16, etc. 8118 8119 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8120 8121 // Extract the vector elements from Op1 and Op2 one by one and truncate them 8122 // to be the right size for the destination. For example, if Op1 is v4i1 then 8123 // the promoted vector is v4i32. The result of concatenation gives a v8i1, 8124 // which when promoted is v8i16. That means each i32 element from Op1 needs 8125 // truncating to i16 and inserting in the result. 8126 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8127 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8128 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8129 EVT NewVT = NewV.getValueType(); 8130 EVT ConcatVT = ConVec.getValueType(); 8131 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8132 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8133 DAG.getIntPtrConstant(i, dl)); 8134 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8135 DAG.getConstant(j, dl, MVT::i32)); 8136 } 8137 return ConVec; 8138 }; 8139 unsigned j = 0; 8140 ConVec = ExtractInto(NewV1, ConVec, j); 8141 ConVec = ExtractInto(NewV2, ConVec, j); 8142 8143 // Now return the result of comparing the concatenated vector with zero, 8144 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8145 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8146 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8147 } 8148 8149 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8150 const ARMSubtarget *ST) { 8151 EVT VT = Op->getValueType(0); 8152 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8153 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8154 8155 // The only time a CONCAT_VECTORS operation can have legal types is when 8156 // two 64-bit vectors are concatenated to a 128-bit vector.
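// e.g. concat(v2f32, v2f32) -> v4f32: each operand is bitcast to an f64 and
// inserted into one half of a v2f64, which is then bitcast to the result type.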
8157 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 8158 "unexpected CONCAT_VECTORS"); 8159 SDLoc dl(Op); 8160 SDValue Val = DAG.getUNDEF(MVT::v2f64); 8161 SDValue Op0 = Op.getOperand(0); 8162 SDValue Op1 = Op.getOperand(1); 8163 if (!Op0.isUndef()) 8164 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8165 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 8166 DAG.getIntPtrConstant(0, dl)); 8167 if (!Op1.isUndef()) 8168 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8169 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 8170 DAG.getIntPtrConstant(1, dl)); 8171 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 8172 } 8173 8174 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 8175 const ARMSubtarget *ST) { 8176 SDValue V1 = Op.getOperand(0); 8177 SDValue V2 = Op.getOperand(1); 8178 SDLoc dl(Op); 8179 EVT VT = Op.getValueType(); 8180 EVT Op1VT = V1.getValueType(); 8181 unsigned NumElts = VT.getVectorNumElements(); 8182 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 8183 8184 assert(VT.getScalarSizeInBits() == 1 && 8185 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 8186 assert(ST->hasMVEIntegerOps() && 8187 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 8188 8189 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8190 8191 // We now have Op1 promoted to a vector of integers, where v8i1 gets 8192 // promoted to v8i16, etc. 8193 8194 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8195 8196 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 8197 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 8198 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 8199 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 8200 DAG.getIntPtrConstant(i, dl)); 8201 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 8202 DAG.getConstant(j, dl, MVT::i32)); 8203 } 8204 8205 // Now return the result of comparing the subvector with zero, 8206 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8207 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 8208 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8209 } 8210 8211 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 8212 /// element has been zero/sign-extended, depending on the isSigned parameter, 8213 /// from an integer type half its size. 8214 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 8215 bool isSigned) { 8216 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 8217 EVT VT = N->getValueType(0); 8218 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 8219 SDNode *BVN = N->getOperand(0).getNode(); 8220 if (BVN->getValueType(0) != MVT::v4i32 || 8221 BVN->getOpcode() != ISD::BUILD_VECTOR) 8222 return false; 8223 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 8224 unsigned HiElt = 1 - LoElt; 8225 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 8226 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 8227 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 8228 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 8229 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 8230 return false; 8231 if (isSigned) { 8232 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 8233 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 8234 return true; 8235 } else { 8236 if (Hi0->isNullValue() && Hi1->isNullValue()) 8237 return true; 8238 } 8239 return false; 8240 } 8241 8242 if (N->getOpcode() != ISD::BUILD_VECTOR) 8243 return false; 8244 8245 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 8246 SDNode *Elt = N->getOperand(i).getNode(); 8247 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 8248 unsigned EltSize = VT.getScalarSizeInBits(); 8249 unsigned HalfSize = EltSize / 2; 8250 if (isSigned) { 8251 if (!isIntN(HalfSize, C->getSExtValue())) 8252 return false; 8253 } else { 8254 if (!isUIntN(HalfSize, C->getZExtValue())) 8255 return false; 8256 } 8257 continue; 8258 } 8259 return false; 8260 } 8261 8262 return true; 8263 } 8264 8265 /// isSignExtended - Check if a node is a vector value that is sign-extended 8266 /// or a constant BUILD_VECTOR with sign-extended elements. 8267 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 8268 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 8269 return true; 8270 if (isExtendedBUILD_VECTOR(N, DAG, true)) 8271 return true; 8272 return false; 8273 } 8274 8275 /// isZeroExtended - Check if a node is a vector value that is zero-extended 8276 /// or a constant BUILD_VECTOR with zero-extended elements. 8277 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 8278 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 8279 return true; 8280 if (isExtendedBUILD_VECTOR(N, DAG, false)) 8281 return true; 8282 return false; 8283 } 8284 8285 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 8286 if (OrigVT.getSizeInBits() >= 64) 8287 return OrigVT; 8288 8289 assert(OrigVT.isSimple() && "Expecting a simple value type"); 8290 8291 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 8292 switch (OrigSimpleTy) { 8293 default: llvm_unreachable("Unexpected Vector Type"); 8294 case MVT::v2i8: 8295 case MVT::v2i16: 8296 return MVT::v2i32; 8297 case MVT::v4i8: 8298 return MVT::v4i16; 8299 } 8300 } 8301 8302 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 8303 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 8304 /// We insert the required extension here to get the vector to fill a D register. 8305 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 8306 const EVT &OrigTy, 8307 const EVT &ExtTy, 8308 unsigned ExtOpcode) { 8309 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 8310 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 8311 // 64-bits we need to insert a new extension so that it will be 64-bits. 8312 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 8313 if (OrigTy.getSizeInBits() >= 64) 8314 return N; 8315 8316 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
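// e.g. a v4i8 operand is extended to v4i16, and a v2i8 or v2i16 operand to
// v2i32, so that the value fills a 64-bit D register.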
8317 EVT NewVT = getExtensionTo64Bits(OrigTy); 8318 8319 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 8320 } 8321 8322 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 8323 /// does not do any sign/zero extension. If the original vector is less 8324 /// than 64 bits, an appropriate extension will be added after the load to 8325 /// reach a total size of 64 bits. We have to add the extension separately 8326 /// because ARM does not have a sign/zero extending load for vectors. 8327 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 8328 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 8329 8330 // The load already has the right type. 8331 if (ExtendedTy == LD->getMemoryVT()) 8332 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 8333 LD->getBasePtr(), LD->getPointerInfo(), 8334 LD->getAlignment(), LD->getMemOperand()->getFlags()); 8335 8336 // We need to create a zextload/sextload. We cannot just create a load 8337 // followed by a zext/sext node because LowerMUL is also run during normal 8338 // operation legalization where we can't create illegal types. 8339 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 8340 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 8341 LD->getMemoryVT(), LD->getAlignment(), 8342 LD->getMemOperand()->getFlags()); 8343 } 8344 8345 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 8346 /// extending load, or BUILD_VECTOR with extended elements, return the 8347 /// unextended value. The unextended vector should be 64 bits so that it can 8348 /// be used as an operand to a VMULL instruction. If the original vector size 8349 /// before extension is less than 64 bits we add an extension to resize 8350 /// the vector to 64 bits. 8351 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 8352 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 8353 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 8354 N->getOperand(0)->getValueType(0), 8355 N->getValueType(0), 8356 N->getOpcode()); 8357 8358 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 8359 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && 8360 "Expected extending load"); 8361 8362 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); 8363 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); 8364 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 8365 SDValue extLoad = 8366 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); 8367 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); 8368 8369 return newLoad; 8370 } 8371 8372 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 8373 // have been legalized as a BITCAST from v4i32. 8374 if (N->getOpcode() == ISD::BITCAST) { 8375 SDNode *BVN = N->getOperand(0).getNode(); 8376 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 8377 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 8378 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 8379 return DAG.getBuildVector( 8380 MVT::v2i32, SDLoc(N), 8381 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 8382 } 8383 // Construct a new BUILD_VECTOR with elements truncated to half the size.
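// e.g. a constant v4i32 whose elements were sign/zero-extended from i16 is
// rebuilt as a v4i16 BUILD_VECTOR (still with i32 operands, since smaller
// integer element types are not legal here).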
8384 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 8385 EVT VT = N->getValueType(0); 8386 unsigned EltSize = VT.getScalarSizeInBits() / 2; 8387 unsigned NumElts = VT.getVectorNumElements(); 8388 MVT TruncVT = MVT::getIntegerVT(EltSize); 8389 SmallVector<SDValue, 8> Ops; 8390 SDLoc dl(N); 8391 for (unsigned i = 0; i != NumElts; ++i) { 8392 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 8393 const APInt &CInt = C->getAPIntValue(); 8394 // Element types smaller than 32 bits are not legal, so use i32 elements. 8395 // The values are implicitly truncated so sext vs. zext doesn't matter. 8396 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 8397 } 8398 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 8399 } 8400 8401 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 8402 unsigned Opcode = N->getOpcode(); 8403 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8404 SDNode *N0 = N->getOperand(0).getNode(); 8405 SDNode *N1 = N->getOperand(1).getNode(); 8406 return N0->hasOneUse() && N1->hasOneUse() && 8407 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 8408 } 8409 return false; 8410 } 8411 8412 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 8413 unsigned Opcode = N->getOpcode(); 8414 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8415 SDNode *N0 = N->getOperand(0).getNode(); 8416 SDNode *N1 = N->getOperand(1).getNode(); 8417 return N0->hasOneUse() && N1->hasOneUse() && 8418 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 8419 } 8420 return false; 8421 } 8422 8423 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 8424 // Multiplications are only custom-lowered for 128-bit vectors so that 8425 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 8426 EVT VT = Op.getValueType(); 8427 assert(VT.is128BitVector() && VT.isInteger() && 8428 "unexpected type for custom-lowering ISD::MUL"); 8429 SDNode *N0 = Op.getOperand(0).getNode(); 8430 SDNode *N1 = Op.getOperand(1).getNode(); 8431 unsigned NewOpc = 0; 8432 bool isMLA = false; 8433 bool isN0SExt = isSignExtended(N0, DAG); 8434 bool isN1SExt = isSignExtended(N1, DAG); 8435 if (isN0SExt && isN1SExt) 8436 NewOpc = ARMISD::VMULLs; 8437 else { 8438 bool isN0ZExt = isZeroExtended(N0, DAG); 8439 bool isN1ZExt = isZeroExtended(N1, DAG); 8440 if (isN0ZExt && isN1ZExt) 8441 NewOpc = ARMISD::VMULLu; 8442 else if (isN1SExt || isN1ZExt) { 8443 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 8444 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 8445 if (isN1SExt && isAddSubSExt(N0, DAG)) { 8446 NewOpc = ARMISD::VMULLs; 8447 isMLA = true; 8448 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 8449 NewOpc = ARMISD::VMULLu; 8450 isMLA = true; 8451 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 8452 std::swap(N0, N1); 8453 NewOpc = ARMISD::VMULLu; 8454 isMLA = true; 8455 } 8456 } 8457 8458 if (!NewOpc) { 8459 if (VT == MVT::v2i64) 8460 // Fall through to expand this. It is not legal. 8461 return SDValue(); 8462 else 8463 // Other vector multiplications are legal. 8464 return Op; 8465 } 8466 } 8467 8468 // Legalize to a VMULL instruction. 
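// e.g. a v2i64 multiply whose operands are both sign-extended from v2i32
// becomes a single VMULLs of the two unextended v2i32 values.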
8469 SDLoc DL(Op); 8470 SDValue Op0; 8471 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 8472 if (!isMLA) { 8473 Op0 = SkipExtensionForVMULL(N0, DAG); 8474 assert(Op0.getValueType().is64BitVector() && 8475 Op1.getValueType().is64BitVector() && 8476 "unexpected types for extended operands to VMULL"); 8477 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 8478 } 8479 8480 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 8481 // isel lowering to take advantage of no-stall back to back vmul + vmla. 8482 // vmull q0, d4, d6 8483 // vmlal q0, d5, d6 8484 // is faster than 8485 // vaddl q0, d4, d5 8486 // vmovl q1, d6 8487 // vmul q0, q0, q1 8488 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 8489 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 8490 EVT Op1VT = Op1.getValueType(); 8491 return DAG.getNode(N0->getOpcode(), DL, VT, 8492 DAG.getNode(NewOpc, DL, VT, 8493 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 8494 DAG.getNode(NewOpc, DL, VT, 8495 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 8496 } 8497 8498 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 8499 SelectionDAG &DAG) { 8500 // TODO: Should this propagate fast-math-flags? 8501 8502 // Convert to float 8503 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 8504 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 8505 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 8506 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 8507 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 8508 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 8509 // Get reciprocal estimate. 8510 // float4 recip = vrecpeq_f32(yf); 8511 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8512 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8513 Y); 8514 // Because char has a smaller range than uchar, we can actually get away 8515 // without any newton steps. This requires that we use a weird bias 8516 // of 0xb000, however (again, this has been exhaustively tested). 8517 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 8518 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 8519 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 8520 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 8521 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 8522 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 8523 // Convert back to short. 8524 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 8525 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 8526 return X; 8527 } 8528 8529 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 8530 SelectionDAG &DAG) { 8531 // TODO: Should this propagate fast-math-flags? 8532 8533 SDValue N2; 8534 // Convert to float. 8535 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 8536 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 8537 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 8538 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 8539 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8540 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8541 8542 // Use reciprocal estimate and one refinement step. 
8543 // float4 recip = vrecpeq_f32(yf); 8544 // recip *= vrecpsq_f32(yf, recip); 8545 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8546 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8547 N1); 8548 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8549 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8550 N1, N2); 8551 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8552 // Because short has a smaller range than ushort, we can actually get away 8553 // with only a single newton step. This requires that we use a weird bias 8554 // of 89, however (again, this has been exhaustively tested). 8555 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 8556 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8557 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8558 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 8559 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8560 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8561 // Convert back to integer and return. 8562 // return vmovn_s32(vcvt_s32_f32(result)); 8563 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8564 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8565 return N0; 8566 } 8567 8568 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 8569 const ARMSubtarget *ST) { 8570 EVT VT = Op.getValueType(); 8571 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8572 "unexpected type for custom-lowering ISD::SDIV"); 8573 8574 SDLoc dl(Op); 8575 SDValue N0 = Op.getOperand(0); 8576 SDValue N1 = Op.getOperand(1); 8577 SDValue N2, N3; 8578 8579 if (VT == MVT::v8i8) { 8580 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 8581 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 8582 8583 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8584 DAG.getIntPtrConstant(4, dl)); 8585 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8586 DAG.getIntPtrConstant(4, dl)); 8587 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8588 DAG.getIntPtrConstant(0, dl)); 8589 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8590 DAG.getIntPtrConstant(0, dl)); 8591 8592 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 8593 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 8594 8595 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8596 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8597 8598 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 8599 return N0; 8600 } 8601 return LowerSDIV_v4i16(N0, N1, dl, DAG); 8602 } 8603 8604 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 8605 const ARMSubtarget *ST) { 8606 // TODO: Should this propagate fast-math-flags? 
8607 EVT VT = Op.getValueType(); 8608 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 8609 "unexpected type for custom-lowering ISD::UDIV"); 8610 8611 SDLoc dl(Op); 8612 SDValue N0 = Op.getOperand(0); 8613 SDValue N1 = Op.getOperand(1); 8614 SDValue N2, N3; 8615 8616 if (VT == MVT::v8i8) { 8617 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 8618 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 8619 8620 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8621 DAG.getIntPtrConstant(4, dl)); 8622 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8623 DAG.getIntPtrConstant(4, dl)); 8624 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 8625 DAG.getIntPtrConstant(0, dl)); 8626 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 8627 DAG.getIntPtrConstant(0, dl)); 8628 8629 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 8630 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 8631 8632 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 8633 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 8634 8635 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 8636 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 8637 MVT::i32), 8638 N0); 8639 return N0; 8640 } 8641 8642 // v4i16 sdiv ... Convert to float. 8643 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 8644 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 8645 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 8646 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 8647 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 8648 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 8649 8650 // Use reciprocal estimate and two refinement steps. 8651 // float4 recip = vrecpeq_f32(yf); 8652 // recip *= vrecpsq_f32(yf, recip); 8653 // recip *= vrecpsq_f32(yf, recip); 8654 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8655 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 8656 BN1); 8657 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8658 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8659 BN1, N2); 8660 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8661 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 8662 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 8663 BN1, N2); 8664 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 8665 // Simply multiplying by the reciprocal estimate can leave us a few ulps 8666 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 8667 // and that it will never cause us to return an answer too large). 8668 // float4 result = as_float4(as_int4(xf*recip) + 2); 8669 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 8670 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 8671 N1 = DAG.getConstant(2, dl, MVT::v4i32); 8672 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 8673 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 8674 // Convert back to integer and return. 8675 // return vmovn_u32(vcvt_s32_f32(result)); 8676 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 8677 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 8678 return N0; 8679 } 8680 8681 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 8682 SDNode *N = Op.getNode(); 8683 EVT VT = N->getValueType(0); 8684 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 8685 8686 SDValue Carry = Op.getOperand(2); 8687 8688 SDLoc DL(Op); 8689 8690 SDValue Result; 8691 if (Op.getOpcode() == ISD::ADDCARRY) { 8692 // This converts the boolean value carry into the carry flag. 
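// (Informal sketch of the whole ADDCARRY path, assuming the usual selection:
// the incoming 0/1 carry is moved into the CPSR C flag, the addition is done
// with ARMISD::ADDE so that it both consumes and produces C (an adc), and the
// outgoing C flag is then converted back into a 0/1 value for the second
// result.)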
8693 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8694 8695 // Do the addition proper using the carry flag we wanted. 8696 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 8697 Op.getOperand(1), Carry); 8698 8699 // Now convert the carry flag into a boolean value. 8700 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8701 } else { 8702 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 8703 // have to invert the carry first. 8704 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8705 DAG.getConstant(1, DL, MVT::i32), Carry); 8706 // This converts the boolean value carry into the carry flag. 8707 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 8708 8709 // Do the subtraction proper using the carry flag we wanted. 8710 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 8711 Op.getOperand(1), Carry); 8712 8713 // Now convert the carry flag into a boolean value. 8714 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 8715 // But the carry returned by ARMISD::SUBE is not a borrow as expected 8716 // by ISD::SUBCARRY, so compute 1 - C. 8717 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 8718 DAG.getConstant(1, DL, MVT::i32), Carry); 8719 } 8720 8721 // Return both values. 8722 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 8723 } 8724 8725 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 8726 assert(Subtarget->isTargetDarwin()); 8727 8728 // For iOS, we want to call an alternative entry point: __sincos_stret, 8729 // return values are passed via sret. 8730 SDLoc dl(Op); 8731 SDValue Arg = Op.getOperand(0); 8732 EVT ArgVT = Arg.getValueType(); 8733 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8734 auto PtrVT = getPointerTy(DAG.getDataLayout()); 8735 8736 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8737 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8738 8739 // Pair of floats / doubles used to pass the result. 8740 Type *RetTy = StructType::get(ArgTy, ArgTy); 8741 auto &DL = DAG.getDataLayout(); 8742 8743 ArgListTy Args; 8744 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 8745 SDValue SRet; 8746 if (ShouldUseSRet) { 8747 // Create stack object for sret. 8748 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 8749 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 8750 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 8751 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 8752 8753 ArgListEntry Entry; 8754 Entry.Node = SRet; 8755 Entry.Ty = RetTy->getPointerTo(); 8756 Entry.IsSExt = false; 8757 Entry.IsZExt = false; 8758 Entry.IsSRet = true; 8759 Args.push_back(Entry); 8760 RetTy = Type::getVoidTy(*DAG.getContext()); 8761 } 8762 8763 ArgListEntry Entry; 8764 Entry.Node = Arg; 8765 Entry.Ty = ArgTy; 8766 Entry.IsSExt = false; 8767 Entry.IsZExt = false; 8768 Args.push_back(Entry); 8769 8770 RTLIB::Libcall LC = 8771 (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 8772 const char *LibcallName = getLibcallName(LC); 8773 CallingConv::ID CC = getLibcallCallingConv(LC); 8774 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 8775 8776 TargetLowering::CallLoweringInfo CLI(DAG); 8777 CLI.setDebugLoc(dl) 8778 .setChain(DAG.getEntryNode()) 8779 .setCallee(CC, RetTy, Callee, std::move(Args)) 8780 .setDiscardResult(ShouldUseSRet); 8781 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 8782 8783 if (!ShouldUseSRet) 8784 return CallResult.first; 8785 8786 SDValue LoadSin = 8787 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 8788 8789 // Address of cos field. 8790 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 8791 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 8792 SDValue LoadCos = 8793 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 8794 8795 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 8796 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 8797 LoadSin.getValue(0), LoadCos.getValue(0)); 8798 } 8799 8800 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 8801 bool Signed, 8802 SDValue &Chain) const { 8803 EVT VT = Op.getValueType(); 8804 assert((VT == MVT::i32 || VT == MVT::i64) && 8805 "unexpected type for custom lowering DIV"); 8806 SDLoc dl(Op); 8807 8808 const auto &DL = DAG.getDataLayout(); 8809 const auto &TLI = DAG.getTargetLoweringInfo(); 8810 8811 const char *Name = nullptr; 8812 if (Signed) 8813 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 8814 else 8815 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 8816 8817 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 8818 8819 ARMTargetLowering::ArgListTy Args; 8820 8821 for (auto AI : {1, 0}) { 8822 ArgListEntry Arg; 8823 Arg.Node = Op.getOperand(AI); 8824 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 8825 Args.push_back(Arg); 8826 } 8827 8828 CallLoweringInfo CLI(DAG); 8829 CLI.setDebugLoc(dl) 8830 .setChain(Chain) 8831 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 8832 ES, std::move(Args)); 8833 8834 return LowerCallTo(CLI).first; 8835 } 8836 8837 // This is a code size optimisation: return the original SDIV node to 8838 // DAGCombiner when we don't want to expand SDIV into a sequence of 8839 // instructions, and an empty node otherwise which will cause the 8840 // SDIV to be expanded in DAGCombine. 8841 SDValue 8842 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 8843 SelectionDAG &DAG, 8844 SmallVectorImpl<SDNode *> &Created) const { 8845 // TODO: Support SREM 8846 if (N->getOpcode() != ISD::SDIV) 8847 return SDValue(); 8848 8849 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 8850 const bool MinSize = ST.hasMinSize(); 8851 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 8852 : ST.hasDivideInARMMode(); 8853 8854 // Don't touch vector types; rewriting this may lead to scalarizing 8855 // the int divs. 8856 if (N->getOperand(0).getValueType().isVector()) 8857 return SDValue(); 8858 8859 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 8860 // hwdiv support for this to be really profitable. 8861 if (!(MinSize && HasDivide)) 8862 return SDValue(); 8863 8864 // ARM mode is a bit simpler than Thumb: we can handle large power 8865 // of 2 immediates with 1 mov instruction; no further checks required, 8866 // just return the sdiv node. 
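// (Rough size comparison, for illustration only and not emitted here: the
// generic signed divide-by-8 expansion is on the order of three instructions,
// e.g. an asr to extract the sign, an add of the shifted sign bits, and a
// final asr, while a hardware divide needs only a constant mov plus sdiv.)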
8867 if (!ST.isThumb()) 8868 return SDValue(N, 0); 8869 8870 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 8871 // and thus lose the code size benefits of a MOVS that requires only 2. 8872 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 8873 // but as it's doing exactly this, it's not worth the trouble to get TTI. 8874 if (Divisor.sgt(128)) 8875 return SDValue(); 8876 8877 return SDValue(N, 0); 8878 } 8879 8880 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 8881 bool Signed) const { 8882 assert(Op.getValueType() == MVT::i32 && 8883 "unexpected type for custom lowering DIV"); 8884 SDLoc dl(Op); 8885 8886 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 8887 DAG.getEntryNode(), Op.getOperand(1)); 8888 8889 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 8890 } 8891 8892 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 8893 SDLoc DL(N); 8894 SDValue Op = N->getOperand(1); 8895 if (N->getValueType(0) == MVT::i32) 8896 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 8897 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 8898 DAG.getConstant(0, DL, MVT::i32)); 8899 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 8900 DAG.getConstant(1, DL, MVT::i32)); 8901 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 8902 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 8903 } 8904 8905 void ARMTargetLowering::ExpandDIV_Windows( 8906 SDValue Op, SelectionDAG &DAG, bool Signed, 8907 SmallVectorImpl<SDValue> &Results) const { 8908 const auto &DL = DAG.getDataLayout(); 8909 const auto &TLI = DAG.getTargetLoweringInfo(); 8910 8911 assert(Op.getValueType() == MVT::i64 && 8912 "unexpected type for custom lowering DIV"); 8913 SDLoc dl(Op); 8914 8915 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 8916 8917 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 8918 8919 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 8920 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 8921 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 8922 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 8923 8924 Results.push_back(Lower); 8925 Results.push_back(Upper); 8926 } 8927 8928 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { 8929 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); 8930 EVT MemVT = LD->getMemoryVT(); 8931 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 8932 "Expected a predicate type!"); 8933 assert(MemVT == Op.getValueType()); 8934 assert(LD->getExtensionType() == ISD::NON_EXTLOAD && 8935 "Expected a non-extending load"); 8936 assert(LD->isUnindexed() && "Expected a unindexed load"); 8937 8938 // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit 8939 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We 8940 // need to make sure that 8/4 bits are actually loaded into the correct 8941 // place, which means loading the value and then shuffling the values into 8942 // the bottom bits of the predicate. 8943 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect 8944 // for BE). 
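// (Shape of the little-endian lowering built below, as a sketch:
//   (v4i1 (load %p))
//     -> (i32 (extload %p, mem:i4))           ; load only the 4 stored bits
//     -> (v16i1 (ARMISD::PREDICATE_CAST ...)) ; reinterpret as a predicate
//     -> (v4i1 (extract_subvector ..., 0))    ; keep the original lanes
// with the load's chain returned alongside the predicate.)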
8945 8946 SDLoc dl(Op); 8947 SDValue Load = DAG.getExtLoad( 8948 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), 8949 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 8950 LD->getMemOperand()); 8951 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); 8952 if (MemVT != MVT::v16i1) 8953 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, 8954 DAG.getConstant(0, dl, MVT::i32)); 8955 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); 8956 } 8957 8958 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { 8959 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 8960 EVT MemVT = ST->getMemoryVT(); 8961 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 8962 "Expected a predicate type!"); 8963 assert(MemVT == ST->getValue().getValueType()); 8964 assert(!ST->isTruncatingStore() && "Expected a non-extending store"); 8965 assert(ST->isUnindexed() && "Expected a unindexed store"); 8966 8967 // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits 8968 // unset and a scalar store. 8969 SDLoc dl(Op); 8970 SDValue Build = ST->getValue(); 8971 if (MemVT != MVT::v16i1) { 8972 SmallVector<SDValue, 16> Ops; 8973 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) 8974 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 8975 DAG.getConstant(I, dl, MVT::i32))); 8976 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 8977 Ops.push_back(DAG.getUNDEF(MVT::i32)); 8978 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 8979 } 8980 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 8981 return DAG.getTruncStore( 8982 ST->getChain(), dl, GRP, ST->getBasePtr(), 8983 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 8984 ST->getMemOperand()); 8985 } 8986 8987 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 8988 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 8989 MVT VT = Op.getSimpleValueType(); 8990 SDValue Mask = N->getMask(); 8991 SDValue PassThru = N->getPassThru(); 8992 SDLoc dl(Op); 8993 8994 auto IsZero = [](SDValue PassThru) { 8995 return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || 8996 (PassThru->getOpcode() == ARMISD::VMOVIMM && 8997 isNullConstant(PassThru->getOperand(0)))); 8998 }; 8999 9000 if (IsZero(PassThru)) 9001 return Op; 9002 9003 // MVE Masked loads use zero as the passthru value. Here we convert undef to 9004 // zero too, and other values are lowered to a select. 9005 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 9006 DAG.getTargetConstant(0, dl, MVT::i32)); 9007 SDValue NewLoad = DAG.getMaskedLoad( 9008 VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), 9009 N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); 9010 SDValue Combo = NewLoad; 9011 if (!PassThru.isUndef() && 9012 (PassThru.getOpcode() != ISD::BITCAST || 9013 !IsZero(PassThru->getOperand(0)))) 9014 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 9015 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 9016 } 9017 9018 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 9019 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 9020 // Acquire/Release load/store is not legal for targets without a dmb or 9021 // equivalent available. 9022 return SDValue(); 9023 9024 // Monotonic load/store is legal for all targets. 
9025 return Op; 9026 } 9027 9028 static void ReplaceREADCYCLECOUNTER(SDNode *N, 9029 SmallVectorImpl<SDValue> &Results, 9030 SelectionDAG &DAG, 9031 const ARMSubtarget *Subtarget) { 9032 SDLoc DL(N); 9033 // Under Power Management extensions, the cycle-count is: 9034 // mrc p15, #0, <Rt>, c9, c13, #0 9035 SDValue Ops[] = { N->getOperand(0), // Chain 9036 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 9037 DAG.getTargetConstant(15, DL, MVT::i32), 9038 DAG.getTargetConstant(0, DL, MVT::i32), 9039 DAG.getTargetConstant(9, DL, MVT::i32), 9040 DAG.getTargetConstant(13, DL, MVT::i32), 9041 DAG.getTargetConstant(0, DL, MVT::i32) 9042 }; 9043 9044 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 9045 DAG.getVTList(MVT::i32, MVT::Other), Ops); 9046 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 9047 DAG.getConstant(0, DL, MVT::i32))); 9048 Results.push_back(Cycles32.getValue(1)); 9049 } 9050 9051 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 9052 SDLoc dl(V.getNode()); 9053 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 9054 SDValue VHi = DAG.getAnyExtOrTrunc( 9055 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 9056 dl, MVT::i32); 9057 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9058 if (isBigEndian) 9059 std::swap (VLo, VHi); 9060 SDValue RegClass = 9061 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 9062 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 9063 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 9064 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 9065 return SDValue( 9066 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 9067 } 9068 9069 static void ReplaceCMP_SWAP_64Results(SDNode *N, 9070 SmallVectorImpl<SDValue> & Results, 9071 SelectionDAG &DAG) { 9072 assert(N->getValueType(0) == MVT::i64 && 9073 "AtomicCmpSwap on types less than 64 should be legal"); 9074 SDValue Ops[] = {N->getOperand(1), 9075 createGPRPairNode(DAG, N->getOperand(2)), 9076 createGPRPairNode(DAG, N->getOperand(3)), 9077 N->getOperand(0)}; 9078 SDNode *CmpSwap = DAG.getMachineNode( 9079 ARM::CMP_SWAP_64, SDLoc(N), 9080 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 9081 9082 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 9083 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 9084 9085 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9086 9087 Results.push_back( 9088 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 9089 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9090 Results.push_back( 9091 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 9092 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 9093 Results.push_back(SDValue(CmpSwap, 2)); 9094 } 9095 9096 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, 9097 SelectionDAG &DAG) { 9098 const auto &TLI = DAG.getTargetLoweringInfo(); 9099 9100 assert(Subtarget.getTargetTriple().isOSMSVCRT() && 9101 "Custom lowering is MSVCRT specific!"); 9102 9103 SDLoc dl(Op); 9104 SDValue Val = Op.getOperand(0); 9105 MVT Ty = Val->getSimpleValueType(0); 9106 SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); 9107 SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? 
"powf" : "pow", 9108 TLI.getPointerTy(DAG.getDataLayout())); 9109 9110 TargetLowering::ArgListTy Args; 9111 TargetLowering::ArgListEntry Entry; 9112 9113 Entry.Node = Val; 9114 Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); 9115 Entry.IsZExt = true; 9116 Args.push_back(Entry); 9117 9118 Entry.Node = Exponent; 9119 Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); 9120 Entry.IsZExt = true; 9121 Args.push_back(Entry); 9122 9123 Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); 9124 9125 // In the in-chain to the call is the entry node If we are emitting a 9126 // tailcall, the chain will be mutated if the node has a non-entry input 9127 // chain. 9128 SDValue InChain = DAG.getEntryNode(); 9129 SDValue TCChain = InChain; 9130 9131 const Function &F = DAG.getMachineFunction().getFunction(); 9132 bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && 9133 F.getReturnType() == LCRTy; 9134 if (IsTC) 9135 InChain = TCChain; 9136 9137 TargetLowering::CallLoweringInfo CLI(DAG); 9138 CLI.setDebugLoc(dl) 9139 .setChain(InChain) 9140 .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) 9141 .setTailCall(IsTC); 9142 std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI); 9143 9144 // Return the chain (the DAG root) if it is a tail call 9145 return !CI.second.getNode() ? DAG.getRoot() : CI.first; 9146 } 9147 9148 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9149 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 9150 switch (Op.getOpcode()) { 9151 default: llvm_unreachable("Don't know how to custom lower this!"); 9152 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 9153 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9154 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9155 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9156 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9157 case ISD::SELECT: return LowerSELECT(Op, DAG); 9158 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9159 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9160 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 9161 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 9162 case ISD::VASTART: return LowerVASTART(Op, DAG); 9163 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 9164 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 9165 case ISD::SINT_TO_FP: 9166 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9167 case ISD::FP_TO_SINT: 9168 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 9169 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9170 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9171 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9172 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 9173 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 9174 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 9175 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 9176 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 9177 Subtarget); 9178 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 9179 case ISD::SHL: 9180 case ISD::SRL: 9181 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 9182 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 9183 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 9184 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 9185 case 
ISD::SRL_PARTS: 9186 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 9187 case ISD::CTTZ: 9188 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 9189 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 9190 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 9191 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 9192 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 9193 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 9194 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 9195 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 9196 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9197 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 9198 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 9199 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9200 case ISD::MUL: return LowerMUL(Op, DAG); 9201 case ISD::SDIV: 9202 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9203 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 9204 return LowerSDIV(Op, DAG, Subtarget); 9205 case ISD::UDIV: 9206 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9207 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 9208 return LowerUDIV(Op, DAG, Subtarget); 9209 case ISD::ADDCARRY: 9210 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 9211 case ISD::SADDO: 9212 case ISD::SSUBO: 9213 return LowerSignedALUO(Op, DAG); 9214 case ISD::UADDO: 9215 case ISD::USUBO: 9216 return LowerUnsignedALUO(Op, DAG); 9217 case ISD::SADDSAT: 9218 case ISD::SSUBSAT: 9219 return LowerSADDSUBSAT(Op, DAG, Subtarget); 9220 case ISD::LOAD: 9221 return LowerPredicateLoad(Op, DAG); 9222 case ISD::STORE: 9223 return LowerPredicateStore(Op, DAG); 9224 case ISD::MLOAD: 9225 return LowerMLOAD(Op, DAG); 9226 case ISD::ATOMIC_LOAD: 9227 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 9228 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 9229 case ISD::SDIVREM: 9230 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 9231 case ISD::DYNAMIC_STACKALLOC: 9232 if (Subtarget->isTargetWindows()) 9233 return LowerDYNAMIC_STACKALLOC(Op, DAG); 9234 llvm_unreachable("Don't know how to custom lower this!"); 9235 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 9236 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 9237 case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); 9238 case ARMISD::WIN__DBZCHK: return SDValue(); 9239 } 9240 } 9241 9242 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 9243 SelectionDAG &DAG) { 9244 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9245 unsigned Opc = 0; 9246 if (IntNo == Intrinsic::arm_smlald) 9247 Opc = ARMISD::SMLALD; 9248 else if (IntNo == Intrinsic::arm_smlaldx) 9249 Opc = ARMISD::SMLALDX; 9250 else if (IntNo == Intrinsic::arm_smlsld) 9251 Opc = ARMISD::SMLSLD; 9252 else if (IntNo == Intrinsic::arm_smlsldx) 9253 Opc = ARMISD::SMLSLDX; 9254 else 9255 return; 9256 9257 SDLoc dl(N); 9258 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9259 N->getOperand(3), 9260 DAG.getConstant(0, dl, MVT::i32)); 9261 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9262 N->getOperand(3), 9263 DAG.getConstant(1, dl, MVT::i32)); 9264 9265 SDValue LongMul = DAG.getNode(Opc, dl, 9266 DAG.getVTList(MVT::i32, MVT::i32), 9267 N->getOperand(1), N->getOperand(2), 9268 Lo, Hi); 
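// The i64 accumulator (operand 3) was split into Lo/Hi above because SMLALD
// and the related instructions keep the 64-bit accumulator in a pair of i32
// registers (RdLo/RdHi); both halves are pushed as the expanded result.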
9269 Results.push_back(LongMul.getValue(0)); 9270 Results.push_back(LongMul.getValue(1)); 9271 } 9272 9273 /// ReplaceNodeResults - Replace the results of node with an illegal result 9274 /// type with new values built out of custom code. 9275 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 9276 SmallVectorImpl<SDValue> &Results, 9277 SelectionDAG &DAG) const { 9278 SDValue Res; 9279 switch (N->getOpcode()) { 9280 default: 9281 llvm_unreachable("Don't know how to custom expand this!"); 9282 case ISD::READ_REGISTER: 9283 ExpandREAD_REGISTER(N, Results, DAG); 9284 break; 9285 case ISD::BITCAST: 9286 Res = ExpandBITCAST(N, DAG, Subtarget); 9287 break; 9288 case ISD::SRL: 9289 case ISD::SRA: 9290 case ISD::SHL: 9291 Res = Expand64BitShift(N, DAG, Subtarget); 9292 break; 9293 case ISD::SREM: 9294 case ISD::UREM: 9295 Res = LowerREM(N, DAG); 9296 break; 9297 case ISD::SDIVREM: 9298 case ISD::UDIVREM: 9299 Res = LowerDivRem(SDValue(N, 0), DAG); 9300 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 9301 Results.push_back(Res.getValue(0)); 9302 Results.push_back(Res.getValue(1)); 9303 return; 9304 case ISD::SADDSAT: 9305 case ISD::SSUBSAT: 9306 Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 9307 break; 9308 case ISD::READCYCLECOUNTER: 9309 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 9310 return; 9311 case ISD::UDIV: 9312 case ISD::SDIV: 9313 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 9314 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 9315 Results); 9316 case ISD::ATOMIC_CMP_SWAP: 9317 ReplaceCMP_SWAP_64Results(N, Results, DAG); 9318 return; 9319 case ISD::INTRINSIC_WO_CHAIN: 9320 return ReplaceLongIntrinsic(N, Results, DAG); 9321 case ISD::ABS: 9322 lowerABS(N, Results, DAG); 9323 return ; 9324 9325 } 9326 if (Res.getNode()) 9327 Results.push_back(Res); 9328 } 9329 9330 //===----------------------------------------------------------------------===// 9331 // ARM Scheduler Hooks 9332 //===----------------------------------------------------------------------===// 9333 9334 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 9335 /// registers the function context. 9336 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 9337 MachineBasicBlock *MBB, 9338 MachineBasicBlock *DispatchBB, 9339 int FI) const { 9340 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 9341 "ROPI/RWPI not currently supported with SjLj"); 9342 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9343 DebugLoc dl = MI.getDebugLoc(); 9344 MachineFunction *MF = MBB->getParent(); 9345 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9346 MachineConstantPool *MCP = MF->getConstantPool(); 9347 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 9348 const Function &F = MF->getFunction(); 9349 9350 bool isThumb = Subtarget->isThumb(); 9351 bool isThumb2 = Subtarget->isThumb2(); 9352 9353 unsigned PCLabelId = AFI->createPICLabelUId(); 9354 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 9355 ARMConstantPoolValue *CPV = 9356 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 9357 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 9358 9359 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 9360 : &ARM::GPRRegClass; 9361 9362 // Grab constant pool and fixed stack memory operands. 
9363 MachineMemOperand *CPMMO = 9364 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 9365 MachineMemOperand::MOLoad, 4, 4); 9366 9367 MachineMemOperand *FIMMOSt = 9368 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 9369 MachineMemOperand::MOStore, 4, 4); 9370 9371 // Load the address of the dispatch MBB into the jump buffer. 9372 if (isThumb2) { 9373 // Incoming value: jbuf 9374 // ldr.n r5, LCPI1_1 9375 // orr r5, r5, #1 9376 // add r5, pc 9377 // str r5, [$jbuf, #+4] ; &jbuf[1] 9378 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9379 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 9380 .addConstantPoolIndex(CPI) 9381 .addMemOperand(CPMMO) 9382 .add(predOps(ARMCC::AL)); 9383 // Set the low bit because of thumb mode. 9384 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9385 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 9386 .addReg(NewVReg1, RegState::Kill) 9387 .addImm(0x01) 9388 .add(predOps(ARMCC::AL)) 9389 .add(condCodeOp()); 9390 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9391 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 9392 .addReg(NewVReg2, RegState::Kill) 9393 .addImm(PCLabelId); 9394 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 9395 .addReg(NewVReg3, RegState::Kill) 9396 .addFrameIndex(FI) 9397 .addImm(36) // &jbuf[1] :: pc 9398 .addMemOperand(FIMMOSt) 9399 .add(predOps(ARMCC::AL)); 9400 } else if (isThumb) { 9401 // Incoming value: jbuf 9402 // ldr.n r1, LCPI1_4 9403 // add r1, pc 9404 // mov r2, #1 9405 // orrs r1, r2 9406 // add r2, $jbuf, #+4 ; &jbuf[1] 9407 // str r1, [r2] 9408 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9409 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 9410 .addConstantPoolIndex(CPI) 9411 .addMemOperand(CPMMO) 9412 .add(predOps(ARMCC::AL)); 9413 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9414 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 9415 .addReg(NewVReg1, RegState::Kill) 9416 .addImm(PCLabelId); 9417 // Set the low bit because of thumb mode. 
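// (Bit 0 of an interworking branch target selects the instruction set, so the
// dispatch address stored in the jump buffer needs it set in order to stay in
// Thumb state when it is later loaded into pc.)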
9418 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9419 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 9420 .addReg(ARM::CPSR, RegState::Define) 9421 .addImm(1) 9422 .add(predOps(ARMCC::AL)); 9423 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9424 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 9425 .addReg(ARM::CPSR, RegState::Define) 9426 .addReg(NewVReg2, RegState::Kill) 9427 .addReg(NewVReg3, RegState::Kill) 9428 .add(predOps(ARMCC::AL)); 9429 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9430 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 9431 .addFrameIndex(FI) 9432 .addImm(36); // &jbuf[1] :: pc 9433 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 9434 .addReg(NewVReg4, RegState::Kill) 9435 .addReg(NewVReg5, RegState::Kill) 9436 .addImm(0) 9437 .addMemOperand(FIMMOSt) 9438 .add(predOps(ARMCC::AL)); 9439 } else { 9440 // Incoming value: jbuf 9441 // ldr r1, LCPI1_1 9442 // add r1, pc, r1 9443 // str r1, [$jbuf, #+4] ; &jbuf[1] 9444 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9445 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 9446 .addConstantPoolIndex(CPI) 9447 .addImm(0) 9448 .addMemOperand(CPMMO) 9449 .add(predOps(ARMCC::AL)); 9450 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9451 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 9452 .addReg(NewVReg1, RegState::Kill) 9453 .addImm(PCLabelId) 9454 .add(predOps(ARMCC::AL)); 9455 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 9456 .addReg(NewVReg2, RegState::Kill) 9457 .addFrameIndex(FI) 9458 .addImm(36) // &jbuf[1] :: pc 9459 .addMemOperand(FIMMOSt) 9460 .add(predOps(ARMCC::AL)); 9461 } 9462 } 9463 9464 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 9465 MachineBasicBlock *MBB) const { 9466 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9467 DebugLoc dl = MI.getDebugLoc(); 9468 MachineFunction *MF = MBB->getParent(); 9469 MachineRegisterInfo *MRI = &MF->getRegInfo(); 9470 MachineFrameInfo &MFI = MF->getFrameInfo(); 9471 int FI = MFI.getFunctionContextIndex(); 9472 9473 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 9474 : &ARM::GPRnopcRegClass; 9475 9476 // Get a mapping of the call site numbers to all of the landing pads they're 9477 // associated with. 9478 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 9479 unsigned MaxCSNum = 0; 9480 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 9481 ++BB) { 9482 if (!BB->isEHPad()) continue; 9483 9484 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 9485 // pad. 9486 for (MachineBasicBlock::iterator 9487 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 9488 if (!II->isEHLabel()) continue; 9489 9490 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 9491 if (!MF->hasCallSiteLandingPad(Sym)) continue; 9492 9493 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 9494 for (SmallVectorImpl<unsigned>::iterator 9495 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 9496 CSI != CSE; ++CSI) { 9497 CallSiteNumToLPad[*CSI].push_back(&*BB); 9498 MaxCSNum = std::max(MaxCSNum, *CSI); 9499 } 9500 break; 9501 } 9502 } 9503 9504 // Get an ordered list of the machine basic blocks for the jump table. 
9505 std::vector<MachineBasicBlock*> LPadList; 9506 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 9507 LPadList.reserve(CallSiteNumToLPad.size()); 9508 for (unsigned I = 1; I <= MaxCSNum; ++I) { 9509 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 9510 for (SmallVectorImpl<MachineBasicBlock*>::iterator 9511 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 9512 LPadList.push_back(*II); 9513 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 9514 } 9515 } 9516 9517 assert(!LPadList.empty() && 9518 "No landing pad destinations for the dispatch jump table!"); 9519 9520 // Create the jump table and associated information. 9521 MachineJumpTableInfo *JTI = 9522 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 9523 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 9524 9525 // Create the MBBs for the dispatch code. 9526 9527 // Shove the dispatch's address into the return slot in the function context. 9528 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 9529 DispatchBB->setIsEHPad(); 9530 9531 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9532 unsigned trap_opcode; 9533 if (Subtarget->isThumb()) 9534 trap_opcode = ARM::tTRAP; 9535 else 9536 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 9537 9538 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 9539 DispatchBB->addSuccessor(TrapBB); 9540 9541 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 9542 DispatchBB->addSuccessor(DispContBB); 9543 9544 // Insert and MBBs. 9545 MF->insert(MF->end(), DispatchBB); 9546 MF->insert(MF->end(), DispContBB); 9547 MF->insert(MF->end(), TrapBB); 9548 9549 // Insert code into the entry block that creates and registers the function 9550 // context. 9551 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 9552 9553 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 9554 MachinePointerInfo::getFixedStack(*MF, FI), 9555 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 9556 9557 MachineInstrBuilder MIB; 9558 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 9559 9560 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 9561 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 9562 9563 // Add a register mask with no preserved registers. This results in all 9564 // registers being marked as clobbered. This can't work if the dispatch block 9565 // is in a Thumb1 function and is linked with ARM code which uses the FP 9566 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 
9567 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 9568 9569 bool IsPositionIndependent = isPositionIndependent(); 9570 unsigned NumLPads = LPadList.size(); 9571 if (Subtarget->isThumb2()) { 9572 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9573 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 9574 .addFrameIndex(FI) 9575 .addImm(4) 9576 .addMemOperand(FIMMOLd) 9577 .add(predOps(ARMCC::AL)); 9578 9579 if (NumLPads < 256) { 9580 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 9581 .addReg(NewVReg1) 9582 .addImm(LPadList.size()) 9583 .add(predOps(ARMCC::AL)); 9584 } else { 9585 Register VReg1 = MRI->createVirtualRegister(TRC); 9586 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 9587 .addImm(NumLPads & 0xFFFF) 9588 .add(predOps(ARMCC::AL)); 9589 9590 unsigned VReg2 = VReg1; 9591 if ((NumLPads & 0xFFFF0000) != 0) { 9592 VReg2 = MRI->createVirtualRegister(TRC); 9593 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 9594 .addReg(VReg1) 9595 .addImm(NumLPads >> 16) 9596 .add(predOps(ARMCC::AL)); 9597 } 9598 9599 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 9600 .addReg(NewVReg1) 9601 .addReg(VReg2) 9602 .add(predOps(ARMCC::AL)); 9603 } 9604 9605 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 9606 .addMBB(TrapBB) 9607 .addImm(ARMCC::HI) 9608 .addReg(ARM::CPSR); 9609 9610 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9611 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 9612 .addJumpTableIndex(MJTI) 9613 .add(predOps(ARMCC::AL)); 9614 9615 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9616 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 9617 .addReg(NewVReg3, RegState::Kill) 9618 .addReg(NewVReg1) 9619 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9620 .add(predOps(ARMCC::AL)) 9621 .add(condCodeOp()); 9622 9623 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 9624 .addReg(NewVReg4, RegState::Kill) 9625 .addReg(NewVReg1) 9626 .addJumpTableIndex(MJTI); 9627 } else if (Subtarget->isThumb()) { 9628 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9629 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 9630 .addFrameIndex(FI) 9631 .addImm(1) 9632 .addMemOperand(FIMMOLd) 9633 .add(predOps(ARMCC::AL)); 9634 9635 if (NumLPads < 256) { 9636 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 9637 .addReg(NewVReg1) 9638 .addImm(NumLPads) 9639 .add(predOps(ARMCC::AL)); 9640 } else { 9641 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9642 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9643 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9644 9645 // MachineConstantPool wants an explicit alignment. 
9646 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9647 if (Align == 0) 9648 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9649 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9650 9651 Register VReg1 = MRI->createVirtualRegister(TRC); 9652 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 9653 .addReg(VReg1, RegState::Define) 9654 .addConstantPoolIndex(Idx) 9655 .add(predOps(ARMCC::AL)); 9656 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 9657 .addReg(NewVReg1) 9658 .addReg(VReg1) 9659 .add(predOps(ARMCC::AL)); 9660 } 9661 9662 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 9663 .addMBB(TrapBB) 9664 .addImm(ARMCC::HI) 9665 .addReg(ARM::CPSR); 9666 9667 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9668 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 9669 .addReg(ARM::CPSR, RegState::Define) 9670 .addReg(NewVReg1) 9671 .addImm(2) 9672 .add(predOps(ARMCC::AL)); 9673 9674 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9675 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 9676 .addJumpTableIndex(MJTI) 9677 .add(predOps(ARMCC::AL)); 9678 9679 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9680 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 9681 .addReg(ARM::CPSR, RegState::Define) 9682 .addReg(NewVReg2, RegState::Kill) 9683 .addReg(NewVReg3) 9684 .add(predOps(ARMCC::AL)); 9685 9686 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9687 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9688 9689 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9690 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 9691 .addReg(NewVReg4, RegState::Kill) 9692 .addImm(0) 9693 .addMemOperand(JTMMOLd) 9694 .add(predOps(ARMCC::AL)); 9695 9696 unsigned NewVReg6 = NewVReg5; 9697 if (IsPositionIndependent) { 9698 NewVReg6 = MRI->createVirtualRegister(TRC); 9699 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 9700 .addReg(ARM::CPSR, RegState::Define) 9701 .addReg(NewVReg5, RegState::Kill) 9702 .addReg(NewVReg3) 9703 .add(predOps(ARMCC::AL)); 9704 } 9705 9706 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 9707 .addReg(NewVReg6, RegState::Kill) 9708 .addJumpTableIndex(MJTI); 9709 } else { 9710 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9711 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 9712 .addFrameIndex(FI) 9713 .addImm(4) 9714 .addMemOperand(FIMMOLd) 9715 .add(predOps(ARMCC::AL)); 9716 9717 if (NumLPads < 256) { 9718 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 9719 .addReg(NewVReg1) 9720 .addImm(NumLPads) 9721 .add(predOps(ARMCC::AL)); 9722 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 9723 Register VReg1 = MRI->createVirtualRegister(TRC); 9724 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 9725 .addImm(NumLPads & 0xFFFF) 9726 .add(predOps(ARMCC::AL)); 9727 9728 unsigned VReg2 = VReg1; 9729 if ((NumLPads & 0xFFFF0000) != 0) { 9730 VReg2 = MRI->createVirtualRegister(TRC); 9731 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 9732 .addReg(VReg1) 9733 .addImm(NumLPads >> 16) 9734 .add(predOps(ARMCC::AL)); 9735 } 9736 9737 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9738 .addReg(NewVReg1) 9739 .addReg(VReg2) 9740 .add(predOps(ARMCC::AL)); 9741 } else { 9742 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9743 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9744 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9745 9746 // MachineConstantPool wants an explicit alignment. 
9747 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9748 if (Align == 0) 9749 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9750 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9751 9752 Register VReg1 = MRI->createVirtualRegister(TRC); 9753 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 9754 .addReg(VReg1, RegState::Define) 9755 .addConstantPoolIndex(Idx) 9756 .addImm(0) 9757 .add(predOps(ARMCC::AL)); 9758 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9759 .addReg(NewVReg1) 9760 .addReg(VReg1, RegState::Kill) 9761 .add(predOps(ARMCC::AL)); 9762 } 9763 9764 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 9765 .addMBB(TrapBB) 9766 .addImm(ARMCC::HI) 9767 .addReg(ARM::CPSR); 9768 9769 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9770 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 9771 .addReg(NewVReg1) 9772 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9773 .add(predOps(ARMCC::AL)) 9774 .add(condCodeOp()); 9775 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9776 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 9777 .addJumpTableIndex(MJTI) 9778 .add(predOps(ARMCC::AL)); 9779 9780 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 9781 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 9782 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9783 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 9784 .addReg(NewVReg3, RegState::Kill) 9785 .addReg(NewVReg4) 9786 .addImm(0) 9787 .addMemOperand(JTMMOLd) 9788 .add(predOps(ARMCC::AL)); 9789 9790 if (IsPositionIndependent) { 9791 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 9792 .addReg(NewVReg5, RegState::Kill) 9793 .addReg(NewVReg4) 9794 .addJumpTableIndex(MJTI); 9795 } else { 9796 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 9797 .addReg(NewVReg5, RegState::Kill) 9798 .addJumpTableIndex(MJTI); 9799 } 9800 } 9801 9802 // Add the jump table entries as successors to the MBB. 9803 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 9804 for (std::vector<MachineBasicBlock*>::iterator 9805 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 9806 MachineBasicBlock *CurMBB = *I; 9807 if (SeenMBBs.insert(CurMBB).second) 9808 DispContBB->addSuccessor(CurMBB); 9809 } 9810 9811 // N.B. the order the invoke BBs are processed in doesn't matter here. 9812 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 9813 SmallVector<MachineBasicBlock*, 64> MBBLPads; 9814 for (MachineBasicBlock *BB : InvokeBBs) { 9815 9816 // Remove the landing pad successor from the invoke block and replace it 9817 // with the new dispatch block. 9818 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 9819 BB->succ_end()); 9820 while (!Successors.empty()) { 9821 MachineBasicBlock *SMBB = Successors.pop_back_val(); 9822 if (SMBB->isEHPad()) { 9823 BB->removeSuccessor(SMBB); 9824 MBBLPads.push_back(SMBB); 9825 } 9826 } 9827 9828 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 9829 BB->normalizeSuccProbs(); 9830 9831 // Find the invoke call and mark all of the callee-saved registers as 9832 // 'implicit defined' so that they're spilled. This prevents code from 9833 // moving instructions to before the EH block, where they will never be 9834 // executed. 
9835 for (MachineBasicBlock::reverse_iterator 9836 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 9837 if (!II->isCall()) continue; 9838 9839 DenseMap<unsigned, bool> DefRegs; 9840 for (MachineInstr::mop_iterator 9841 OI = II->operands_begin(), OE = II->operands_end(); 9842 OI != OE; ++OI) { 9843 if (!OI->isReg()) continue; 9844 DefRegs[OI->getReg()] = true; 9845 } 9846 9847 MachineInstrBuilder MIB(*MF, &*II); 9848 9849 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 9850 unsigned Reg = SavedRegs[i]; 9851 if (Subtarget->isThumb2() && 9852 !ARM::tGPRRegClass.contains(Reg) && 9853 !ARM::hGPRRegClass.contains(Reg)) 9854 continue; 9855 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 9856 continue; 9857 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 9858 continue; 9859 if (!DefRegs[Reg]) 9860 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 9861 } 9862 9863 break; 9864 } 9865 } 9866 9867 // Mark all former landing pads as non-landing pads. The dispatch is the only 9868 // landing pad now. 9869 for (SmallVectorImpl<MachineBasicBlock*>::iterator 9870 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 9871 (*I)->setIsEHPad(false); 9872 9873 // The instruction is gone now. 9874 MI.eraseFromParent(); 9875 } 9876 9877 static 9878 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 9879 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 9880 E = MBB->succ_end(); I != E; ++I) 9881 if (*I != Succ) 9882 return *I; 9883 llvm_unreachable("Expecting a BB with two successors!"); 9884 } 9885 9886 /// Return the load opcode for a given load size. If load size >= 8, 9887 /// neon opcode will be returned. 9888 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 9889 if (LdSize >= 8) 9890 return LdSize == 16 ? ARM::VLD1q32wb_fixed 9891 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 9892 if (IsThumb1) 9893 return LdSize == 4 ? ARM::tLDRi 9894 : LdSize == 2 ? ARM::tLDRHi 9895 : LdSize == 1 ? ARM::tLDRBi : 0; 9896 if (IsThumb2) 9897 return LdSize == 4 ? ARM::t2LDR_POST 9898 : LdSize == 2 ? ARM::t2LDRH_POST 9899 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 9900 return LdSize == 4 ? ARM::LDR_POST_IMM 9901 : LdSize == 2 ? ARM::LDRH_POST 9902 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 9903 } 9904 9905 /// Return the store opcode for a given store size. If store size >= 8, 9906 /// neon opcode will be returned. 9907 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 9908 if (StSize >= 8) 9909 return StSize == 16 ? ARM::VST1q32wb_fixed 9910 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 9911 if (IsThumb1) 9912 return StSize == 4 ? ARM::tSTRi 9913 : StSize == 2 ? ARM::tSTRHi 9914 : StSize == 1 ? ARM::tSTRBi : 0; 9915 if (IsThumb2) 9916 return StSize == 4 ? ARM::t2STR_POST 9917 : StSize == 2 ? ARM::t2STRH_POST 9918 : StSize == 1 ? ARM::t2STRB_POST : 0; 9919 return StSize == 4 ? ARM::STR_POST_IMM 9920 : StSize == 2 ? ARM::STRH_POST 9921 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 9922 } 9923 9924 /// Emit a post-increment load operation with given size. The instructions 9925 /// will be added to BB at Pos. 
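/// As an informal example: a 4-byte unit in Thumb2 mode is emitted as a
/// post-indexed t2LDR_POST, roughly "ldr Data, [AddrIn], #4", with AddrOut
/// receiving the incremented address; Thumb1 has no post-indexed load, so a
/// plain tLDRi is followed by a separate tADDi8 that advances the pointer.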
9926 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 9927 const TargetInstrInfo *TII, const DebugLoc &dl, 9928 unsigned LdSize, unsigned Data, unsigned AddrIn, 9929 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 9930 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 9931 assert(LdOpc != 0 && "Should have a load opcode"); 9932 if (LdSize >= 8) { 9933 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 9934 .addReg(AddrOut, RegState::Define) 9935 .addReg(AddrIn) 9936 .addImm(0) 9937 .add(predOps(ARMCC::AL)); 9938 } else if (IsThumb1) { 9939 // load + update AddrIn 9940 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 9941 .addReg(AddrIn) 9942 .addImm(0) 9943 .add(predOps(ARMCC::AL)); 9944 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 9945 .add(t1CondCodeOp()) 9946 .addReg(AddrIn) 9947 .addImm(LdSize) 9948 .add(predOps(ARMCC::AL)); 9949 } else if (IsThumb2) { 9950 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 9951 .addReg(AddrOut, RegState::Define) 9952 .addReg(AddrIn) 9953 .addImm(LdSize) 9954 .add(predOps(ARMCC::AL)); 9955 } else { // arm 9956 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 9957 .addReg(AddrOut, RegState::Define) 9958 .addReg(AddrIn) 9959 .addReg(0) 9960 .addImm(LdSize) 9961 .add(predOps(ARMCC::AL)); 9962 } 9963 } 9964 9965 /// Emit a post-increment store operation with given size. The instructions 9966 /// will be added to BB at Pos. 9967 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 9968 const TargetInstrInfo *TII, const DebugLoc &dl, 9969 unsigned StSize, unsigned Data, unsigned AddrIn, 9970 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 9971 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 9972 assert(StOpc != 0 && "Should have a store opcode"); 9973 if (StSize >= 8) { 9974 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 9975 .addReg(AddrIn) 9976 .addImm(0) 9977 .addReg(Data) 9978 .add(predOps(ARMCC::AL)); 9979 } else if (IsThumb1) { 9980 // store + update AddrIn 9981 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 9982 .addReg(Data) 9983 .addReg(AddrIn) 9984 .addImm(0) 9985 .add(predOps(ARMCC::AL)); 9986 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 9987 .add(t1CondCodeOp()) 9988 .addReg(AddrIn) 9989 .addImm(StSize) 9990 .add(predOps(ARMCC::AL)); 9991 } else if (IsThumb2) { 9992 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 9993 .addReg(Data) 9994 .addReg(AddrIn) 9995 .addImm(StSize) 9996 .add(predOps(ARMCC::AL)); 9997 } else { // arm 9998 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 9999 .addReg(Data) 10000 .addReg(AddrIn) 10001 .addReg(0) 10002 .addImm(StSize) 10003 .add(predOps(ARMCC::AL)); 10004 } 10005 } 10006 10007 MachineBasicBlock * 10008 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10009 MachineBasicBlock *BB) const { 10010 // This pseudo instruction has 3 operands: dst, src, size 10011 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10012 // Otherwise, we will generate unrolled scalar copies. 
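// (Worked example of the unrolled case, for illustration: a 10-byte copy with
// 4-byte alignment uses UnitSize == 4, so 8 bytes are moved by two LDR/STR
// post-increment pairs and the remaining BytesLeft == 2 bytes by two
// LDRB/STRB post-increment pairs.)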
10013 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10014 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10015 MachineFunction::iterator It = ++BB->getIterator(); 10016 10017 Register dest = MI.getOperand(0).getReg(); 10018 Register src = MI.getOperand(1).getReg(); 10019 unsigned SizeVal = MI.getOperand(2).getImm(); 10020 unsigned Align = MI.getOperand(3).getImm(); 10021 DebugLoc dl = MI.getDebugLoc(); 10022 10023 MachineFunction *MF = BB->getParent(); 10024 MachineRegisterInfo &MRI = MF->getRegInfo(); 10025 unsigned UnitSize = 0; 10026 const TargetRegisterClass *TRC = nullptr; 10027 const TargetRegisterClass *VecTRC = nullptr; 10028 10029 bool IsThumb1 = Subtarget->isThumb1Only(); 10030 bool IsThumb2 = Subtarget->isThumb2(); 10031 bool IsThumb = Subtarget->isThumb(); 10032 10033 if (Align & 1) { 10034 UnitSize = 1; 10035 } else if (Align & 2) { 10036 UnitSize = 2; 10037 } else { 10038 // Check whether we can use NEON instructions. 10039 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 10040 Subtarget->hasNEON()) { 10041 if ((Align % 16 == 0) && SizeVal >= 16) 10042 UnitSize = 16; 10043 else if ((Align % 8 == 0) && SizeVal >= 8) 10044 UnitSize = 8; 10045 } 10046 // Can't use NEON instructions. 10047 if (UnitSize == 0) 10048 UnitSize = 4; 10049 } 10050 10051 // Select the correct opcode and register class for unit size load/store 10052 bool IsNeon = UnitSize >= 8; 10053 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 10054 if (IsNeon) 10055 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 10056 : UnitSize == 8 ? &ARM::DPRRegClass 10057 : nullptr; 10058 10059 unsigned BytesLeft = SizeVal % UnitSize; 10060 unsigned LoopSize = SizeVal - BytesLeft; 10061 10062 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 10063 // Use LDR and STR to copy. 10064 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 10065 // [destOut] = STR_POST(scratch, destIn, UnitSize) 10066 unsigned srcIn = src; 10067 unsigned destIn = dest; 10068 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 10069 Register srcOut = MRI.createVirtualRegister(TRC); 10070 Register destOut = MRI.createVirtualRegister(TRC); 10071 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10072 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 10073 IsThumb1, IsThumb2); 10074 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 10075 IsThumb1, IsThumb2); 10076 srcIn = srcOut; 10077 destIn = destOut; 10078 } 10079 10080 // Handle the leftover bytes with LDRB and STRB. 10081 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 10082 // [destOut] = STRB_POST(scratch, destIn, 1) 10083 for (unsigned i = 0; i < BytesLeft; i++) { 10084 Register srcOut = MRI.createVirtualRegister(TRC); 10085 Register destOut = MRI.createVirtualRegister(TRC); 10086 Register scratch = MRI.createVirtualRegister(TRC); 10087 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 10088 IsThumb1, IsThumb2); 10089 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 10090 IsThumb1, IsThumb2); 10091 srcIn = srcOut; 10092 destIn = destOut; 10093 } 10094 MI.eraseFromParent(); // The instruction is gone now. 10095 return BB; 10096 } 10097 10098 // Expand the pseudo op to a loop. 10099 // thisMBB: 10100 // ... 
10101 // movw varEnd, # --> with thumb2 10102 // movt varEnd, # 10103 // ldrcp varEnd, idx --> without thumb2 10104 // fallthrough --> loopMBB 10105 // loopMBB: 10106 // PHI varPhi, varEnd, varLoop 10107 // PHI srcPhi, src, srcLoop 10108 // PHI destPhi, dst, destLoop 10109 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10110 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10111 // subs varLoop, varPhi, #UnitSize 10112 // bne loopMBB 10113 // fallthrough --> exitMBB 10114 // exitMBB: 10115 // epilogue to handle left-over bytes 10116 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10117 // [destOut] = STRB_POST(scratch, destLoop, 1) 10118 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10119 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10120 MF->insert(It, loopMBB); 10121 MF->insert(It, exitMBB); 10122 10123 // Transfer the remainder of BB and its successor edges to exitMBB. 10124 exitMBB->splice(exitMBB->begin(), BB, 10125 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10126 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10127 10128 // Load an immediate to varEnd. 10129 Register varEnd = MRI.createVirtualRegister(TRC); 10130 if (Subtarget->useMovt()) { 10131 unsigned Vtmp = varEnd; 10132 if ((LoopSize & 0xFFFF0000) != 0) 10133 Vtmp = MRI.createVirtualRegister(TRC); 10134 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 10135 .addImm(LoopSize & 0xFFFF) 10136 .add(predOps(ARMCC::AL)); 10137 10138 if ((LoopSize & 0xFFFF0000) != 0) 10139 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 10140 .addReg(Vtmp) 10141 .addImm(LoopSize >> 16) 10142 .add(predOps(ARMCC::AL)); 10143 } else { 10144 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10145 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10146 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 10147 10148 // MachineConstantPool wants an explicit alignment. 
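  // (If the preferred-alignment query below returns zero, the code falls back
  // to the type's allocation size instead.)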
10149 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 10150 if (Align == 0) 10151 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 10152 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 10153 MachineMemOperand *CPMMO = 10154 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10155 MachineMemOperand::MOLoad, 4, 4); 10156 10157 if (IsThumb) 10158 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 10159 .addReg(varEnd, RegState::Define) 10160 .addConstantPoolIndex(Idx) 10161 .add(predOps(ARMCC::AL)) 10162 .addMemOperand(CPMMO); 10163 else 10164 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 10165 .addReg(varEnd, RegState::Define) 10166 .addConstantPoolIndex(Idx) 10167 .addImm(0) 10168 .add(predOps(ARMCC::AL)) 10169 .addMemOperand(CPMMO); 10170 } 10171 BB->addSuccessor(loopMBB); 10172 10173 // Generate the loop body: 10174 // varPhi = PHI(varLoop, varEnd) 10175 // srcPhi = PHI(srcLoop, src) 10176 // destPhi = PHI(destLoop, dst) 10177 MachineBasicBlock *entryBB = BB; 10178 BB = loopMBB; 10179 Register varLoop = MRI.createVirtualRegister(TRC); 10180 Register varPhi = MRI.createVirtualRegister(TRC); 10181 Register srcLoop = MRI.createVirtualRegister(TRC); 10182 Register srcPhi = MRI.createVirtualRegister(TRC); 10183 Register destLoop = MRI.createVirtualRegister(TRC); 10184 Register destPhi = MRI.createVirtualRegister(TRC); 10185 10186 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 10187 .addReg(varLoop).addMBB(loopMBB) 10188 .addReg(varEnd).addMBB(entryBB); 10189 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 10190 .addReg(srcLoop).addMBB(loopMBB) 10191 .addReg(src).addMBB(entryBB); 10192 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 10193 .addReg(destLoop).addMBB(loopMBB) 10194 .addReg(dest).addMBB(entryBB); 10195 10196 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10197 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 10198 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10199 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 10200 IsThumb1, IsThumb2); 10201 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 10202 IsThumb1, IsThumb2); 10203 10204 // Decrement loop variable by UnitSize. 10205 if (IsThumb1) { 10206 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 10207 .add(t1CondCodeOp()) 10208 .addReg(varPhi) 10209 .addImm(UnitSize) 10210 .add(predOps(ARMCC::AL)); 10211 } else { 10212 MachineInstrBuilder MIB = 10213 BuildMI(*BB, BB->end(), dl, 10214 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 10215 MIB.addReg(varPhi) 10216 .addImm(UnitSize) 10217 .add(predOps(ARMCC::AL)) 10218 .add(condCodeOp()); 10219 MIB->getOperand(5).setReg(ARM::CPSR); 10220 MIB->getOperand(5).setIsDef(true); 10221 } 10222 BuildMI(*BB, BB->end(), dl, 10223 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10224 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 10225 10226 // loopMBB can loop back to loopMBB or fall through to exitMBB. 10227 BB->addSuccessor(loopMBB); 10228 BB->addSuccessor(exitMBB); 10229 10230 // Add epilogue to handle BytesLeft. 
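  // For instance, copying 18 bytes with UnitSize == 8 leaves BytesLeft == 2,
  // so two LDRB_POST/STRB_POST pairs are emitted at the head of exitMBB.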
10231   BB = exitMBB;
10232   auto StartOfExit = exitMBB->begin();
10233
10234   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
10235   //   [destOut] = STRB_POST(scratch, destLoop, 1)
10236   unsigned srcIn = srcLoop;
10237   unsigned destIn = destLoop;
10238   for (unsigned i = 0; i < BytesLeft; i++) {
10239     Register srcOut = MRI.createVirtualRegister(TRC);
10240     Register destOut = MRI.createVirtualRegister(TRC);
10241     Register scratch = MRI.createVirtualRegister(TRC);
10242     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
10243                IsThumb1, IsThumb2);
10244     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
10245                IsThumb1, IsThumb2);
10246     srcIn = srcOut;
10247     destIn = destOut;
10248   }
10249
10250   MI.eraseFromParent(); // The instruction is gone now.
10251   return BB;
10252 }
10253
10254 MachineBasicBlock *
10255 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
10256                                        MachineBasicBlock *MBB) const {
10257   const TargetMachine &TM = getTargetMachine();
10258   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
10259   DebugLoc DL = MI.getDebugLoc();
10260
10261   assert(Subtarget->isTargetWindows() &&
10262          "__chkstk is only supported on Windows");
10263   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
10264
10265   // __chkstk takes the number of words to allocate on the stack in R4, and
10266   // returns the stack adjustment in number of bytes in R4. This will not
10267   // clobber any other registers (other than the obvious lr).
10268   //
10269   // Although, technically, IP should be considered a register which may be
10270   // clobbered, the call itself will not touch it. Windows on ARM is a pure
10271   // Thumb-2 environment, so there is no interworking required. As a result, we
10272   // do not expect a veneer to be emitted by the linker, clobbering IP.
10273   //
10274   // Each module receives its own copy of __chkstk, so no import thunk is
10275   // required, again, ensuring that IP is not clobbered.
10276   //
10277   // Finally, although some linkers may theoretically provide a trampoline for
10278   // out of range calls (which is quite common due to a 32M range limitation of
10279   // branches for Thumb), we can generate the long-call version via
10280   // -mcmodel=large, alleviating the need for the trampoline which may clobber
10281   // IP.
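  //
  // The emitted sequence is therefore roughly "bl __chkstk" (or a
  // materialized symbol address plus "blx" for the large code model),
  // followed by "sub.w sp, sp, r4" to actually allocate the stack space.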
10282 10283 switch (TM.getCodeModel()) { 10284 case CodeModel::Tiny: 10285 llvm_unreachable("Tiny code model not available on ARM."); 10286 case CodeModel::Small: 10287 case CodeModel::Medium: 10288 case CodeModel::Kernel: 10289 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 10290 .add(predOps(ARMCC::AL)) 10291 .addExternalSymbol("__chkstk") 10292 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10293 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10294 .addReg(ARM::R12, 10295 RegState::Implicit | RegState::Define | RegState::Dead) 10296 .addReg(ARM::CPSR, 10297 RegState::Implicit | RegState::Define | RegState::Dead); 10298 break; 10299 case CodeModel::Large: { 10300 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10301 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 10302 10303 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 10304 .addExternalSymbol("__chkstk"); 10305 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 10306 .add(predOps(ARMCC::AL)) 10307 .addReg(Reg, RegState::Kill) 10308 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 10309 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 10310 .addReg(ARM::R12, 10311 RegState::Implicit | RegState::Define | RegState::Dead) 10312 .addReg(ARM::CPSR, 10313 RegState::Implicit | RegState::Define | RegState::Dead); 10314 break; 10315 } 10316 } 10317 10318 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 10319 .addReg(ARM::SP, RegState::Kill) 10320 .addReg(ARM::R4, RegState::Kill) 10321 .setMIFlags(MachineInstr::FrameSetup) 10322 .add(predOps(ARMCC::AL)) 10323 .add(condCodeOp()); 10324 10325 MI.eraseFromParent(); 10326 return MBB; 10327 } 10328 10329 MachineBasicBlock * 10330 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 10331 MachineBasicBlock *MBB) const { 10332 DebugLoc DL = MI.getDebugLoc(); 10333 MachineFunction *MF = MBB->getParent(); 10334 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10335 10336 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 10337 MF->insert(++MBB->getIterator(), ContBB); 10338 ContBB->splice(ContBB->begin(), MBB, 10339 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 10340 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 10341 MBB->addSuccessor(ContBB); 10342 10343 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10344 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 10345 MF->push_back(TrapBB); 10346 MBB->addSuccessor(TrapBB); 10347 10348 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 10349 .addReg(MI.getOperand(0).getReg()) 10350 .addImm(0) 10351 .add(predOps(ARMCC::AL)); 10352 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 10353 .addMBB(TrapBB) 10354 .addImm(ARMCC::EQ) 10355 .addReg(ARM::CPSR); 10356 10357 MI.eraseFromParent(); 10358 return ContBB; 10359 } 10360 10361 // The CPSR operand of SelectItr might be missing a kill marker 10362 // because there were multiple uses of CPSR, and ISel didn't know 10363 // which to mark. Figure out whether SelectItr should have had a 10364 // kill marker, and set it if it should. Returns the correct kill 10365 // marker value. 10366 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 10367 MachineBasicBlock* BB, 10368 const TargetRegisterInfo* TRI) { 10369 // Scan forward through BB for a use/def of CPSR. 
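  // A read of CPSR before the next def means the flags are still live after
  // the select, so no kill flag may be added; a def with no intervening read
  // means the value dies at the select.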
10370 MachineBasicBlock::iterator miI(std::next(SelectItr)); 10371 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 10372 const MachineInstr& mi = *miI; 10373 if (mi.readsRegister(ARM::CPSR)) 10374 return false; 10375 if (mi.definesRegister(ARM::CPSR)) 10376 break; // Should have kill-flag - update below. 10377 } 10378 10379 // If we hit the end of the block, check whether CPSR is live into a 10380 // successor. 10381 if (miI == BB->end()) { 10382 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 10383 sEnd = BB->succ_end(); 10384 sItr != sEnd; ++sItr) { 10385 MachineBasicBlock* succ = *sItr; 10386 if (succ->isLiveIn(ARM::CPSR)) 10387 return false; 10388 } 10389 } 10390 10391 // We found a def, or hit the end of the basic block and CPSR wasn't live 10392 // out. SelectMI should have a kill flag on CPSR. 10393 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 10394 return true; 10395 } 10396 10397 MachineBasicBlock * 10398 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10399 MachineBasicBlock *BB) const { 10400 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10401 DebugLoc dl = MI.getDebugLoc(); 10402 bool isThumb2 = Subtarget->isThumb2(); 10403 switch (MI.getOpcode()) { 10404 default: { 10405 MI.print(errs()); 10406 llvm_unreachable("Unexpected instr type to insert"); 10407 } 10408 10409 // Thumb1 post-indexed loads are really just single-register LDMs. 10410 case ARM::tLDR_postidx: { 10411 MachineOperand Def(MI.getOperand(1)); 10412 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 10413 .add(Def) // Rn_wb 10414 .add(MI.getOperand(2)) // Rn 10415 .add(MI.getOperand(3)) // PredImm 10416 .add(MI.getOperand(4)) // PredReg 10417 .add(MI.getOperand(0)) // Rt 10418 .cloneMemRefs(MI); 10419 MI.eraseFromParent(); 10420 return BB; 10421 } 10422 10423 // The Thumb2 pre-indexed stores have the same MI operands, they just 10424 // define them differently in the .td files from the isel patterns, so 10425 // they need pseudos. 10426 case ARM::t2STR_preidx: 10427 MI.setDesc(TII->get(ARM::t2STR_PRE)); 10428 return BB; 10429 case ARM::t2STRB_preidx: 10430 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 10431 return BB; 10432 case ARM::t2STRH_preidx: 10433 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 10434 return BB; 10435 10436 case ARM::STRi_preidx: 10437 case ARM::STRBi_preidx: { 10438 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 10439 : ARM::STRB_PRE_IMM; 10440 // Decode the offset. 
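      // The addrmode2 immediate packs the add/sub direction together with the
      // offset magnitude, so convert it back into a plain signed offset for
      // the *_PRE_IMM form.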
10441 unsigned Offset = MI.getOperand(4).getImm(); 10442 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10443 Offset = ARM_AM::getAM2Offset(Offset); 10444 if (isSub) 10445 Offset = -Offset; 10446 10447 MachineMemOperand *MMO = *MI.memoperands_begin(); 10448 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10449 .add(MI.getOperand(0)) // Rn_wb 10450 .add(MI.getOperand(1)) // Rt 10451 .add(MI.getOperand(2)) // Rn 10452 .addImm(Offset) // offset (skip GPR==zero_reg) 10453 .add(MI.getOperand(5)) // pred 10454 .add(MI.getOperand(6)) 10455 .addMemOperand(MMO); 10456 MI.eraseFromParent(); 10457 return BB; 10458 } 10459 case ARM::STRr_preidx: 10460 case ARM::STRBr_preidx: 10461 case ARM::STRH_preidx: { 10462 unsigned NewOpc; 10463 switch (MI.getOpcode()) { 10464 default: llvm_unreachable("unexpected opcode!"); 10465 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10466 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10467 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10468 } 10469 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10470 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10471 MIB.add(MI.getOperand(i)); 10472 MI.eraseFromParent(); 10473 return BB; 10474 } 10475 10476 case ARM::tMOVCCr_pseudo: { 10477 // To "insert" a SELECT_CC instruction, we actually have to insert the 10478 // diamond control-flow pattern. The incoming instruction knows the 10479 // destination vreg to set, the condition code register to branch on, the 10480 // true/false values to select between, and a branch opcode to use. 10481 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10482 MachineFunction::iterator It = ++BB->getIterator(); 10483 10484 // thisMBB: 10485 // ... 10486 // TrueVal = ... 10487 // cmpTY ccX, r1, r2 10488 // bCC copy1MBB 10489 // fallthrough --> copy0MBB 10490 MachineBasicBlock *thisMBB = BB; 10491 MachineFunction *F = BB->getParent(); 10492 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10493 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10494 F->insert(It, copy0MBB); 10495 F->insert(It, sinkMBB); 10496 10497 // Check whether CPSR is live past the tMOVCCr_pseudo. 10498 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10499 if (!MI.killsRegister(ARM::CPSR) && 10500 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10501 copy0MBB->addLiveIn(ARM::CPSR); 10502 sinkMBB->addLiveIn(ARM::CPSR); 10503 } 10504 10505 // Transfer the remainder of BB and its successor edges to sinkMBB. 10506 sinkMBB->splice(sinkMBB->begin(), BB, 10507 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10508 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10509 10510 BB->addSuccessor(copy0MBB); 10511 BB->addSuccessor(sinkMBB); 10512 10513 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10514 .addMBB(sinkMBB) 10515 .addImm(MI.getOperand(3).getImm()) 10516 .addReg(MI.getOperand(4).getReg()); 10517 10518 // copy0MBB: 10519 // %FalseValue = ... 10520 // # fallthrough to sinkMBB 10521 BB = copy0MBB; 10522 10523 // Update machine-CFG edges 10524 BB->addSuccessor(sinkMBB); 10525 10526 // sinkMBB: 10527 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10528 // ... 10529 BB = sinkMBB; 10530 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10531 .addReg(MI.getOperand(1).getReg()) 10532 .addMBB(copy0MBB) 10533 .addReg(MI.getOperand(2).getReg()) 10534 .addMBB(thisMBB); 10535 10536 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10537 return BB; 10538 } 10539 10540 case ARM::BCCi64: 10541 case ARM::BCCZi64: { 10542 // If there is an unconditional branch to the other successor, remove it. 10543 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10544 10545 // Compare both parts that make up the double comparison separately for 10546 // equality. 10547 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10548 10549 Register LHS1 = MI.getOperand(1).getReg(); 10550 Register LHS2 = MI.getOperand(2).getReg(); 10551 if (RHSisZero) { 10552 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10553 .addReg(LHS1) 10554 .addImm(0) 10555 .add(predOps(ARMCC::AL)); 10556 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10557 .addReg(LHS2).addImm(0) 10558 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10559 } else { 10560 Register RHS1 = MI.getOperand(3).getReg(); 10561 Register RHS2 = MI.getOperand(4).getReg(); 10562 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10563 .addReg(LHS1) 10564 .addReg(RHS1) 10565 .add(predOps(ARMCC::AL)); 10566 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10567 .addReg(LHS2).addReg(RHS2) 10568 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10569 } 10570 10571 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10572 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10573 if (MI.getOperand(0).getImm() == ARMCC::NE) 10574 std::swap(destMBB, exitMBB); 10575 10576 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10577 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10578 if (isThumb2) 10579 BuildMI(BB, dl, TII->get(ARM::t2B)) 10580 .addMBB(exitMBB) 10581 .add(predOps(ARMCC::AL)); 10582 else 10583 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10584 10585 MI.eraseFromParent(); // The pseudo instruction is gone now. 10586 return BB; 10587 } 10588 10589 case ARM::Int_eh_sjlj_setjmp: 10590 case ARM::Int_eh_sjlj_setjmp_nofp: 10591 case ARM::tInt_eh_sjlj_setjmp: 10592 case ARM::t2Int_eh_sjlj_setjmp: 10593 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10594 return BB; 10595 10596 case ARM::Int_eh_sjlj_setup_dispatch: 10597 EmitSjLjDispatchBlock(MI, BB); 10598 return BB; 10599 10600 case ARM::ABS: 10601 case ARM::t2ABS: { 10602 // To insert an ABS instruction, we have to insert the 10603 // diamond control-flow pattern. The incoming instruction knows the 10604 // source vreg to test against 0, the destination vreg to set, 10605 // the condition code register to branch on, the 10606 // true/false values to select between, and a branch opcode to use. 
10607 // It transforms 10608 // V1 = ABS V0 10609 // into 10610 // V2 = MOVS V0 10611 // BCC (branch to SinkBB if V0 >= 0) 10612 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 10613 // SinkBB: V1 = PHI(V2, V3) 10614 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10615 MachineFunction::iterator BBI = ++BB->getIterator(); 10616 MachineFunction *Fn = BB->getParent(); 10617 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10618 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10619 Fn->insert(BBI, RSBBB); 10620 Fn->insert(BBI, SinkBB); 10621 10622 Register ABSSrcReg = MI.getOperand(1).getReg(); 10623 Register ABSDstReg = MI.getOperand(0).getReg(); 10624 bool ABSSrcKIll = MI.getOperand(1).isKill(); 10625 bool isThumb2 = Subtarget->isThumb2(); 10626 MachineRegisterInfo &MRI = Fn->getRegInfo(); 10627 // In Thumb mode S must not be specified if source register is the SP or 10628 // PC and if destination register is the SP, so restrict register class 10629 Register NewRsbDstReg = MRI.createVirtualRegister( 10630 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 10631 10632 // Transfer the remainder of BB and its successor edges to sinkMBB. 10633 SinkBB->splice(SinkBB->begin(), BB, 10634 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10635 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 10636 10637 BB->addSuccessor(RSBBB); 10638 BB->addSuccessor(SinkBB); 10639 10640 // fall through to SinkMBB 10641 RSBBB->addSuccessor(SinkBB); 10642 10643 // insert a cmp at the end of BB 10644 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10645 .addReg(ABSSrcReg) 10646 .addImm(0) 10647 .add(predOps(ARMCC::AL)); 10648 10649 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 10650 BuildMI(BB, dl, 10651 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 10652 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 10653 10654 // insert rsbri in RSBBB 10655 // Note: BCC and rsbri will be converted into predicated rsbmi 10656 // by if-conversion pass 10657 BuildMI(*RSBBB, RSBBB->begin(), dl, 10658 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 10659 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 10660 .addImm(0) 10661 .add(predOps(ARMCC::AL)) 10662 .add(condCodeOp()); 10663 10664 // insert PHI in SinkBB, 10665 // reuse ABSDstReg to not change uses of ABS instruction 10666 BuildMI(*SinkBB, SinkBB->begin(), dl, 10667 TII->get(ARM::PHI), ABSDstReg) 10668 .addReg(NewRsbDstReg).addMBB(RSBBB) 10669 .addReg(ABSSrcReg).addMBB(BB); 10670 10671 // remove ABS instruction 10672 MI.eraseFromParent(); 10673 10674 // return last added BB 10675 return SinkBB; 10676 } 10677 case ARM::COPY_STRUCT_BYVAL_I32: 10678 ++NumLoopByVals; 10679 return EmitStructByval(MI, BB); 10680 case ARM::WIN__CHKSTK: 10681 return EmitLowered__chkstk(MI, BB); 10682 case ARM::WIN__DBZCHK: 10683 return EmitLowered__dbzchk(MI, BB); 10684 } 10685 } 10686 10687 /// Attaches vregs to MEMCPY that it will use as scratch registers 10688 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 10689 /// instead of as a custom inserter because we need the use list from the SDNode. 
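/// For example, a MEMCPY that will become a 4-register LDM/STM pair gets four
/// fresh virtual registers attached as dead defs, so that register allocation
/// provides physical registers the later expansion can use as temporaries.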
10690 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 10691 MachineInstr &MI, const SDNode *Node) { 10692 bool isThumb1 = Subtarget->isThumb1Only(); 10693 10694 DebugLoc DL = MI.getDebugLoc(); 10695 MachineFunction *MF = MI.getParent()->getParent(); 10696 MachineRegisterInfo &MRI = MF->getRegInfo(); 10697 MachineInstrBuilder MIB(*MF, MI); 10698 10699 // If the new dst/src is unused mark it as dead. 10700 if (!Node->hasAnyUseOfValue(0)) { 10701 MI.getOperand(0).setIsDead(true); 10702 } 10703 if (!Node->hasAnyUseOfValue(1)) { 10704 MI.getOperand(1).setIsDead(true); 10705 } 10706 10707 // The MEMCPY both defines and kills the scratch registers. 10708 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 10709 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 10710 : &ARM::GPRRegClass); 10711 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 10712 } 10713 } 10714 10715 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 10716 SDNode *Node) const { 10717 if (MI.getOpcode() == ARM::MEMCPY) { 10718 attachMEMCPYScratchRegs(Subtarget, MI, Node); 10719 return; 10720 } 10721 10722 const MCInstrDesc *MCID = &MI.getDesc(); 10723 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 10724 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 10725 // operand is still set to noreg. If needed, set the optional operand's 10726 // register to CPSR, and remove the redundant implicit def. 10727 // 10728 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 10729 10730 // Rename pseudo opcodes. 10731 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 10732 unsigned ccOutIdx; 10733 if (NewOpc) { 10734 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 10735 MCID = &TII->get(NewOpc); 10736 10737 assert(MCID->getNumOperands() == 10738 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 10739 && "converted opcode should be the same except for cc_out" 10740 " (and, on Thumb1, pred)"); 10741 10742 MI.setDesc(*MCID); 10743 10744 // Add the optional cc_out operand 10745 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 10746 10747 // On Thumb1, move all input operands to the end, then add the predicate 10748 if (Subtarget->isThumb1Only()) { 10749 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 10750 MI.addOperand(MI.getOperand(1)); 10751 MI.RemoveOperand(1); 10752 } 10753 10754 // Restore the ties 10755 for (unsigned i = MI.getNumOperands(); i--;) { 10756 const MachineOperand& op = MI.getOperand(i); 10757 if (op.isReg() && op.isUse()) { 10758 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 10759 if (DefIdx != -1) 10760 MI.tieOperands(DefIdx, i); 10761 } 10762 } 10763 10764 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 10765 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 10766 ccOutIdx = 1; 10767 } else 10768 ccOutIdx = MCID->getNumOperands() - 1; 10769 } else 10770 ccOutIdx = MCID->getNumOperands() - 1; 10771 10772 // Any ARM instruction that sets the 's' bit should specify an optional 10773 // "cc_out" operand in the last operand position. 10774 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 10775 assert(!NewOpc && "Optional cc_out operand required"); 10776 return; 10777 } 10778 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 10779 // since we already have an optional CPSR def. 
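  // (Only the implicit operands appended after the fixed operands described
  // by MCID need to be scanned below.)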
10780 bool definesCPSR = false; 10781 bool deadCPSR = false; 10782 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 10783 ++i) { 10784 const MachineOperand &MO = MI.getOperand(i); 10785 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 10786 definesCPSR = true; 10787 if (MO.isDead()) 10788 deadCPSR = true; 10789 MI.RemoveOperand(i); 10790 break; 10791 } 10792 } 10793 if (!definesCPSR) { 10794 assert(!NewOpc && "Optional cc_out operand required"); 10795 return; 10796 } 10797 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 10798 if (deadCPSR) { 10799 assert(!MI.getOperand(ccOutIdx).getReg() && 10800 "expect uninitialized optional cc_out operand"); 10801 // Thumb1 instructions must have the S bit even if the CPSR is dead. 10802 if (!Subtarget->isThumb1Only()) 10803 return; 10804 } 10805 10806 // If this instruction was defined with an optional CPSR def and its dag node 10807 // had a live implicit CPSR def, then activate the optional CPSR def. 10808 MachineOperand &MO = MI.getOperand(ccOutIdx); 10809 MO.setReg(ARM::CPSR); 10810 MO.setIsDef(true); 10811 } 10812 10813 //===----------------------------------------------------------------------===// 10814 // ARM Optimization Hooks 10815 //===----------------------------------------------------------------------===// 10816 10817 // Helper function that checks if N is a null or all ones constant. 10818 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 10819 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 10820 } 10821 10822 // Return true if N is conditionally 0 or all ones. 10823 // Detects these expressions where cc is an i1 value: 10824 // 10825 // (select cc 0, y) [AllOnes=0] 10826 // (select cc y, 0) [AllOnes=0] 10827 // (zext cc) [AllOnes=0] 10828 // (sext cc) [AllOnes=0/1] 10829 // (select cc -1, y) [AllOnes=1] 10830 // (select cc y, -1) [AllOnes=1] 10831 // 10832 // Invert is set when N is the null/all ones constant when CC is false. 10833 // OtherOp is set to the alternative value of N. 10834 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 10835 SDValue &CC, bool &Invert, 10836 SDValue &OtherOp, 10837 SelectionDAG &DAG) { 10838 switch (N->getOpcode()) { 10839 default: return false; 10840 case ISD::SELECT: { 10841 CC = N->getOperand(0); 10842 SDValue N1 = N->getOperand(1); 10843 SDValue N2 = N->getOperand(2); 10844 if (isZeroOrAllOnes(N1, AllOnes)) { 10845 Invert = false; 10846 OtherOp = N2; 10847 return true; 10848 } 10849 if (isZeroOrAllOnes(N2, AllOnes)) { 10850 Invert = true; 10851 OtherOp = N1; 10852 return true; 10853 } 10854 return false; 10855 } 10856 case ISD::ZERO_EXTEND: 10857 // (zext cc) can never be the all ones value. 10858 if (AllOnes) 10859 return false; 10860 LLVM_FALLTHROUGH; 10861 case ISD::SIGN_EXTEND: { 10862 SDLoc dl(N); 10863 EVT VT = N->getValueType(0); 10864 CC = N->getOperand(0); 10865 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 10866 return false; 10867 Invert = !AllOnes; 10868 if (AllOnes) 10869 // When looking for an AllOnes constant, N is an sext, and the 'other' 10870 // value is 0. 10871 OtherOp = DAG.getConstant(0, dl, VT); 10872 else if (N->getOpcode() == ISD::ZERO_EXTEND) 10873 // When looking for a 0 constant, N can be zext or sext. 
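      // (A zext of the i1 condition is 0 or 1, so its alternative value is 1;
      // a sext is 0 or -1, so its alternative value is all ones.)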
10874 OtherOp = DAG.getConstant(1, dl, VT); 10875 else 10876 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 10877 VT); 10878 return true; 10879 } 10880 } 10881 } 10882 10883 // Combine a constant select operand into its use: 10884 // 10885 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 10886 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 10887 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 10888 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 10889 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 10890 // 10891 // The transform is rejected if the select doesn't have a constant operand that 10892 // is null, or all ones when AllOnes is set. 10893 // 10894 // Also recognize sext/zext from i1: 10895 // 10896 // (add (zext cc), x) -> (select cc (add x, 1), x) 10897 // (add (sext cc), x) -> (select cc (add x, -1), x) 10898 // 10899 // These transformations eventually create predicated instructions. 10900 // 10901 // @param N The node to transform. 10902 // @param Slct The N operand that is a select. 10903 // @param OtherOp The other N operand (x above). 10904 // @param DCI Context. 10905 // @param AllOnes Require the select constant to be all ones instead of null. 10906 // @returns The new node, or SDValue() on failure. 10907 static 10908 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 10909 TargetLowering::DAGCombinerInfo &DCI, 10910 bool AllOnes = false) { 10911 SelectionDAG &DAG = DCI.DAG; 10912 EVT VT = N->getValueType(0); 10913 SDValue NonConstantVal; 10914 SDValue CCOp; 10915 bool SwapSelectOps; 10916 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 10917 NonConstantVal, DAG)) 10918 return SDValue(); 10919 10920 // Slct is now know to be the desired identity constant when CC is true. 10921 SDValue TrueVal = OtherOp; 10922 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 10923 OtherOp, NonConstantVal); 10924 // Unless SwapSelectOps says CC should be false. 10925 if (SwapSelectOps) 10926 std::swap(TrueVal, FalseVal); 10927 10928 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 10929 CCOp, TrueVal, FalseVal); 10930 } 10931 10932 // Attempt combineSelectAndUse on each operand of a commutative operator N. 10933 static 10934 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 10935 TargetLowering::DAGCombinerInfo &DCI) { 10936 SDValue N0 = N->getOperand(0); 10937 SDValue N1 = N->getOperand(1); 10938 if (N0.getNode()->hasOneUse()) 10939 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 10940 return Result; 10941 if (N1.getNode()->hasOneUse()) 10942 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 10943 return Result; 10944 return SDValue(); 10945 } 10946 10947 static bool IsVUZPShuffleNode(SDNode *N) { 10948 // VUZP shuffle node. 10949 if (N->getOpcode() == ARMISD::VUZP) 10950 return true; 10951 10952 // "VUZP" on i32 is an alias for VTRN. 10953 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 10954 return true; 10955 10956 return false; 10957 } 10958 10959 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 10960 TargetLowering::DAGCombinerInfo &DCI, 10961 const ARMSubtarget *Subtarget) { 10962 // Look for ADD(VUZP.0, VUZP.1). 10963 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 10964 N0 == N1) 10965 return SDValue(); 10966 10967 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 
10968 if (!N->getValueType(0).is64BitVector()) 10969 return SDValue(); 10970 10971 // Generate vpadd. 10972 SelectionDAG &DAG = DCI.DAG; 10973 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10974 SDLoc dl(N); 10975 SDNode *Unzip = N0.getNode(); 10976 EVT VT = N->getValueType(0); 10977 10978 SmallVector<SDValue, 8> Ops; 10979 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 10980 TLI.getPointerTy(DAG.getDataLayout()))); 10981 Ops.push_back(Unzip->getOperand(0)); 10982 Ops.push_back(Unzip->getOperand(1)); 10983 10984 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 10985 } 10986 10987 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 10988 TargetLowering::DAGCombinerInfo &DCI, 10989 const ARMSubtarget *Subtarget) { 10990 // Check for two extended operands. 10991 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 10992 N1.getOpcode() == ISD::SIGN_EXTEND) && 10993 !(N0.getOpcode() == ISD::ZERO_EXTEND && 10994 N1.getOpcode() == ISD::ZERO_EXTEND)) 10995 return SDValue(); 10996 10997 SDValue N00 = N0.getOperand(0); 10998 SDValue N10 = N1.getOperand(0); 10999 11000 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 11001 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 11002 N00 == N10) 11003 return SDValue(); 11004 11005 // We only recognize Q register paddl here; this can't be reached until 11006 // after type legalization. 11007 if (!N00.getValueType().is64BitVector() || 11008 !N0.getValueType().is128BitVector()) 11009 return SDValue(); 11010 11011 // Generate vpaddl. 11012 SelectionDAG &DAG = DCI.DAG; 11013 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11014 SDLoc dl(N); 11015 EVT VT = N->getValueType(0); 11016 11017 SmallVector<SDValue, 8> Ops; 11018 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 11019 unsigned Opcode; 11020 if (N0.getOpcode() == ISD::SIGN_EXTEND) 11021 Opcode = Intrinsic::arm_neon_vpaddls; 11022 else 11023 Opcode = Intrinsic::arm_neon_vpaddlu; 11024 Ops.push_back(DAG.getConstant(Opcode, dl, 11025 TLI.getPointerTy(DAG.getDataLayout()))); 11026 EVT ElemTy = N00.getValueType().getVectorElementType(); 11027 unsigned NumElts = VT.getVectorNumElements(); 11028 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 11029 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 11030 N00.getOperand(0), N00.getOperand(1)); 11031 Ops.push_back(Concat); 11032 11033 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11034 } 11035 11036 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 11037 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 11038 // much easier to match. 11039 static SDValue 11040 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11041 TargetLowering::DAGCombinerInfo &DCI, 11042 const ARMSubtarget *Subtarget) { 11043 // Only perform optimization if after legalize, and if NEON is available. We 11044 // also expected both operands to be BUILD_VECTORs. 11045 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 11046 || N0.getOpcode() != ISD::BUILD_VECTOR 11047 || N1.getOpcode() != ISD::BUILD_VECTOR) 11048 return SDValue(); 11049 11050 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 11051 EVT VT = N->getValueType(0); 11052 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 11053 return SDValue(); 11054 11055 // Check that the vector operands are of the right form. 
11056 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 11057 // operands, where N is the size of the formed vector. 11058 // Each EXTRACT_VECTOR should have the same input vector and odd or even 11059 // index such that we have a pair wise add pattern. 11060 11061 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. 11062 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11063 return SDValue(); 11064 SDValue Vec = N0->getOperand(0)->getOperand(0); 11065 SDNode *V = Vec.getNode(); 11066 unsigned nextIndex = 0; 11067 11068 // For each operands to the ADD which are BUILD_VECTORs, 11069 // check to see if each of their operands are an EXTRACT_VECTOR with 11070 // the same vector and appropriate index. 11071 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { 11072 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT 11073 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11074 11075 SDValue ExtVec0 = N0->getOperand(i); 11076 SDValue ExtVec1 = N1->getOperand(i); 11077 11078 // First operand is the vector, verify its the same. 11079 if (V != ExtVec0->getOperand(0).getNode() || 11080 V != ExtVec1->getOperand(0).getNode()) 11081 return SDValue(); 11082 11083 // Second is the constant, verify its correct. 11084 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); 11085 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); 11086 11087 // For the constant, we want to see all the even or all the odd. 11088 if (!C0 || !C1 || C0->getZExtValue() != nextIndex 11089 || C1->getZExtValue() != nextIndex+1) 11090 return SDValue(); 11091 11092 // Increment index. 11093 nextIndex+=2; 11094 } else 11095 return SDValue(); 11096 } 11097 11098 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure 11099 // we're using the entire input vector, otherwise there's a size/legality 11100 // mismatch somewhere. 11101 if (nextIndex != Vec.getValueType().getVectorNumElements() || 11102 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 11103 return SDValue(); 11104 11105 // Create VPADDL node. 11106 SelectionDAG &DAG = DCI.DAG; 11107 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11108 11109 SDLoc dl(N); 11110 11111 // Build operand list. 11112 SmallVector<SDValue, 8> Ops; 11113 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 11114 TLI.getPointerTy(DAG.getDataLayout()))); 11115 11116 // Input is the vector. 11117 Ops.push_back(Vec); 11118 11119 // Get widened type and narrowed type. 11120 MVT widenType; 11121 unsigned numElem = VT.getVectorNumElements(); 11122 11123 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 11124 switch (inputLaneType.getSimpleVT().SimpleTy) { 11125 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 11126 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 11127 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 11128 default: 11129 llvm_unreachable("Invalid vector element type for padd optimization."); 11130 } 11131 11132 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 11133 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? 
ISD::ANY_EXTEND : ISD::TRUNCATE; 11134 return DAG.getNode(ExtOp, dl, VT, tmp); 11135 } 11136 11137 static SDValue findMUL_LOHI(SDValue V) { 11138 if (V->getOpcode() == ISD::UMUL_LOHI || 11139 V->getOpcode() == ISD::SMUL_LOHI) 11140 return V; 11141 return SDValue(); 11142 } 11143 11144 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 11145 TargetLowering::DAGCombinerInfo &DCI, 11146 const ARMSubtarget *Subtarget) { 11147 if (!Subtarget->hasBaseDSP()) 11148 return SDValue(); 11149 11150 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 11151 // accumulates the product into a 64-bit value. The 16-bit values will 11152 // be sign extended somehow or SRA'd into 32-bit values 11153 // (addc (adde (mul 16bit, 16bit), lo), hi) 11154 SDValue Mul = AddcNode->getOperand(0); 11155 SDValue Lo = AddcNode->getOperand(1); 11156 if (Mul.getOpcode() != ISD::MUL) { 11157 Lo = AddcNode->getOperand(0); 11158 Mul = AddcNode->getOperand(1); 11159 if (Mul.getOpcode() != ISD::MUL) 11160 return SDValue(); 11161 } 11162 11163 SDValue SRA = AddeNode->getOperand(0); 11164 SDValue Hi = AddeNode->getOperand(1); 11165 if (SRA.getOpcode() != ISD::SRA) { 11166 SRA = AddeNode->getOperand(1); 11167 Hi = AddeNode->getOperand(0); 11168 if (SRA.getOpcode() != ISD::SRA) 11169 return SDValue(); 11170 } 11171 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 11172 if (Const->getZExtValue() != 31) 11173 return SDValue(); 11174 } else 11175 return SDValue(); 11176 11177 if (SRA.getOperand(0) != Mul) 11178 return SDValue(); 11179 11180 SelectionDAG &DAG = DCI.DAG; 11181 SDLoc dl(AddcNode); 11182 unsigned Opcode = 0; 11183 SDValue Op0; 11184 SDValue Op1; 11185 11186 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 11187 Opcode = ARMISD::SMLALBB; 11188 Op0 = Mul.getOperand(0); 11189 Op1 = Mul.getOperand(1); 11190 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 11191 Opcode = ARMISD::SMLALBT; 11192 Op0 = Mul.getOperand(0); 11193 Op1 = Mul.getOperand(1).getOperand(0); 11194 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 11195 Opcode = ARMISD::SMLALTB; 11196 Op0 = Mul.getOperand(0).getOperand(0); 11197 Op1 = Mul.getOperand(1); 11198 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 11199 Opcode = ARMISD::SMLALTT; 11200 Op0 = Mul->getOperand(0).getOperand(0); 11201 Op1 = Mul->getOperand(1).getOperand(0); 11202 } 11203 11204 if (!Op0 || !Op1) 11205 return SDValue(); 11206 11207 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 11208 Op0, Op1, Lo, Hi); 11209 // Replace the ADDs' nodes uses by the MLA node's values. 11210 SDValue HiMLALResult(SMLAL.getNode(), 1); 11211 SDValue LoMLALResult(SMLAL.getNode(), 0); 11212 11213 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 11214 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 11215 11216 // Return original node to notify the driver to stop replacing. 11217 SDValue resNode(AddcNode, 0); 11218 return resNode; 11219 } 11220 11221 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 11222 TargetLowering::DAGCombinerInfo &DCI, 11223 const ARMSubtarget *Subtarget) { 11224 // Look for multiply add opportunities. 11225 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 11226 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 11227 // a glue link from the first add to the second add. 
11228 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 11229 // a S/UMLAL instruction. 11230 // UMUL_LOHI 11231 // / :lo \ :hi 11232 // V \ [no multiline comment] 11233 // loAdd -> ADDC | 11234 // \ :carry / 11235 // V V 11236 // ADDE <- hiAdd 11237 // 11238 // In the special case where only the higher part of a signed result is used 11239 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 11240 // a constant with the exact value of 0x80000000, we recognize we are dealing 11241 // with a "rounded multiply and add" (or subtract) and transform it into 11242 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 11243 11244 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 11245 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 11246 "Expect an ADDE or SUBE"); 11247 11248 assert(AddeSubeNode->getNumOperands() == 3 && 11249 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 11250 "ADDE node has the wrong inputs"); 11251 11252 // Check that we are chained to the right ADDC or SUBC node. 11253 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 11254 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 11255 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 11256 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 11257 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 11258 return SDValue(); 11259 11260 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 11261 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 11262 11263 // Check if the two operands are from the same mul_lohi node. 11264 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 11265 return SDValue(); 11266 11267 assert(AddcSubcNode->getNumValues() == 2 && 11268 AddcSubcNode->getValueType(0) == MVT::i32 && 11269 "Expect ADDC with two result values. First: i32"); 11270 11271 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 11272 // maybe a SMLAL which multiplies two 16-bit values. 11273 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 11274 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 11275 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 11276 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 11277 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 11278 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 11279 11280 // Check for the triangle shape. 11281 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 11282 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 11283 11284 // Make sure that the ADDE/SUBE operands are not coming from the same node. 11285 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 11286 return SDValue(); 11287 11288 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 11289 bool IsLeftOperandMUL = false; 11290 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 11291 if (MULOp == SDValue()) 11292 MULOp = findMUL_LOHI(AddeSubeOp1); 11293 else 11294 IsLeftOperandMUL = true; 11295 if (MULOp == SDValue()) 11296 return SDValue(); 11297 11298 // Figure out the right opcode. 11299 unsigned Opc = MULOp->getOpcode(); 11300 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 11301 11302 // Figure out the high and low input values to the MLAL node. 11303 SDValue *HiAddSub = nullptr; 11304 SDValue *LoMul = nullptr; 11305 SDValue *LowAddSub = nullptr; 11306 11307 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 
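  // (getValue(1) is the high half of the multiply; if neither operand of the
  // ADDE/SUBE is that value, this is not the pattern we are looking for.)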
11308   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
11309     return SDValue();
11310
11311   if (IsLeftOperandMUL)
11312     HiAddSub = &AddeSubeOp1;
11313   else
11314     HiAddSub = &AddeSubeOp0;
11315
11316   // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
11317   // node whose low result is fed to the ADDC/SUBC we are checking.
11318
11319   if (AddcSubcOp0 == MULOp.getValue(0)) {
11320     LoMul = &AddcSubcOp0;
11321     LowAddSub = &AddcSubcOp1;
11322   }
11323   if (AddcSubcOp1 == MULOp.getValue(0)) {
11324     LoMul = &AddcSubcOp1;
11325     LowAddSub = &AddcSubcOp0;
11326   }
11327
11328   if (!LoMul)
11329     return SDValue();
11330
11331   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC,
11332   // the replacement below will create a cycle.
11333   if (AddcSubcNode == HiAddSub->getNode() ||
11334       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
11335     return SDValue();
11336
11337   // Create the merged node.
11338   SelectionDAG &DAG = DCI.DAG;
11339
11340   // Start building operand list.
11341   SmallVector<SDValue, 8> Ops;
11342   Ops.push_back(LoMul->getOperand(0));
11343   Ops.push_back(LoMul->getOperand(1));
11344
11345   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
11346   // the case, we must be doing signed multiplication and only use the higher
11347   // part of the result of the MLAL; furthermore, LowAddSub must be a constant
11348   // addition or subtraction with the value 0x80000000.
11349   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
11350       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
11351       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
11352       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
11353           0x80000000) {
11354     Ops.push_back(*HiAddSub);
11355     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
11356       FinalOpc = ARMISD::SMMLSR;
11357     } else {
11358       FinalOpc = ARMISD::SMMLAR;
11359     }
11360     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
11361     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
11362
11363     return SDValue(AddeSubeNode, 0);
11364   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
11365     // SMMLS is generated during instruction selection and the rest of this
11366     // function cannot handle the case where AddcSubcNode is a SUBC.
11367     return SDValue();
11368
11369   // Finish building the operand list for {U/S}MLAL.
11370   Ops.push_back(*LowAddSub);
11371   Ops.push_back(*HiAddSub);
11372
11373   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
11374                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11375
11376   // Replace the ADD nodes' uses with the MLAL node's values.
11377   SDValue HiMLALResult(MLALNode.getNode(), 1);
11378   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
11379
11380   SDValue LoMLALResult(MLALNode.getNode(), 0);
11381   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
11382
11383   // Return the original node to notify the driver to stop replacing.
11384   return SDValue(AddeSubeNode, 0);
11385 }
11386
11387 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
11388                                       TargetLowering::DAGCombinerInfo &DCI,
11389                                       const ARMSubtarget *Subtarget) {
11390   // UMAAL is similar to UMLAL except that it adds two unsigned values.
11391   // While trying to combine for the other MLAL nodes, first search for the
11392   // chance to use UMAAL. Check if Addc uses a node which has already
11393   // been combined into a UMLAL.
The other pattern is UMLAL using Addc/Adde 11394 // as the addend, and it's handled in PerformUMLALCombine. 11395 11396 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11397 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11398 11399 // Check that we have a glued ADDC node. 11400 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 11401 if (AddcNode->getOpcode() != ARMISD::ADDC) 11402 return SDValue(); 11403 11404 // Find the converted UMAAL or quit if it doesn't exist. 11405 SDNode *UmlalNode = nullptr; 11406 SDValue AddHi; 11407 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 11408 UmlalNode = AddcNode->getOperand(0).getNode(); 11409 AddHi = AddcNode->getOperand(1); 11410 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 11411 UmlalNode = AddcNode->getOperand(1).getNode(); 11412 AddHi = AddcNode->getOperand(0); 11413 } else { 11414 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 11415 } 11416 11417 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 11418 // the ADDC as well as Zero. 11419 if (!isNullConstant(UmlalNode->getOperand(3))) 11420 return SDValue(); 11421 11422 if ((isNullConstant(AddeNode->getOperand(0)) && 11423 AddeNode->getOperand(1).getNode() == UmlalNode) || 11424 (AddeNode->getOperand(0).getNode() == UmlalNode && 11425 isNullConstant(AddeNode->getOperand(1)))) { 11426 SelectionDAG &DAG = DCI.DAG; 11427 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 11428 UmlalNode->getOperand(2), AddHi }; 11429 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 11430 DAG.getVTList(MVT::i32, MVT::i32), Ops); 11431 11432 // Replace the ADDs' nodes uses by the UMAAL node's values. 11433 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 11434 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 11435 11436 // Return original node to notify the driver to stop replacing. 11437 return SDValue(AddeNode, 0); 11438 } 11439 return SDValue(); 11440 } 11441 11442 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 11443 const ARMSubtarget *Subtarget) { 11444 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 11445 return SDValue(); 11446 11447 // Check that we have a pair of ADDC and ADDE as operands. 11448 // Both addends of the ADDE must be zero. 
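  // In that case the hi:lo accumulator is simply the 64-bit sum of the ADDC's
  // two operands, so the whole pattern computes x * y + a + b, which is
  // exactly what UMAAL does.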
11449 SDNode* AddcNode = N->getOperand(2).getNode(); 11450 SDNode* AddeNode = N->getOperand(3).getNode(); 11451 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 11452 (AddeNode->getOpcode() == ARMISD::ADDE) && 11453 isNullConstant(AddeNode->getOperand(0)) && 11454 isNullConstant(AddeNode->getOperand(1)) && 11455 (AddeNode->getOperand(2).getNode() == AddcNode)) 11456 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 11457 DAG.getVTList(MVT::i32, MVT::i32), 11458 {N->getOperand(0), N->getOperand(1), 11459 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 11460 else 11461 return SDValue(); 11462 } 11463 11464 static SDValue PerformAddcSubcCombine(SDNode *N, 11465 TargetLowering::DAGCombinerInfo &DCI, 11466 const ARMSubtarget *Subtarget) { 11467 SelectionDAG &DAG(DCI.DAG); 11468 11469 if (N->getOpcode() == ARMISD::SUBC) { 11470 // (SUBC (ADDE 0, 0, C), 1) -> C 11471 SDValue LHS = N->getOperand(0); 11472 SDValue RHS = N->getOperand(1); 11473 if (LHS->getOpcode() == ARMISD::ADDE && 11474 isNullConstant(LHS->getOperand(0)) && 11475 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 11476 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 11477 } 11478 } 11479 11480 if (Subtarget->isThumb1Only()) { 11481 SDValue RHS = N->getOperand(1); 11482 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11483 int32_t imm = C->getSExtValue(); 11484 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 11485 SDLoc DL(N); 11486 RHS = DAG.getConstant(-imm, DL, MVT::i32); 11487 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 11488 : ARMISD::ADDC; 11489 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 11490 } 11491 } 11492 } 11493 11494 return SDValue(); 11495 } 11496 11497 static SDValue PerformAddeSubeCombine(SDNode *N, 11498 TargetLowering::DAGCombinerInfo &DCI, 11499 const ARMSubtarget *Subtarget) { 11500 if (Subtarget->isThumb1Only()) { 11501 SelectionDAG &DAG = DCI.DAG; 11502 SDValue RHS = N->getOperand(1); 11503 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 11504 int64_t imm = C->getSExtValue(); 11505 if (imm < 0) { 11506 SDLoc DL(N); 11507 11508 // The with-carry-in form matches bitwise not instead of the negation. 11509 // Effectively, the inverse interpretation of the carry flag already 11510 // accounts for part of the negation. 11511 RHS = DAG.getConstant(~imm, DL, MVT::i32); 11512 11513 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? 
ARMISD::SUBE 11514 : ARMISD::ADDE; 11515 return DAG.getNode(Opcode, DL, N->getVTList(), 11516 N->getOperand(0), RHS, N->getOperand(2)); 11517 } 11518 } 11519 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 11520 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 11521 } 11522 return SDValue(); 11523 } 11524 11525 static SDValue PerformABSCombine(SDNode *N, 11526 TargetLowering::DAGCombinerInfo &DCI, 11527 const ARMSubtarget *Subtarget) { 11528 SDValue res; 11529 SelectionDAG &DAG = DCI.DAG; 11530 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11531 11532 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 11533 return SDValue(); 11534 11535 if (!TLI.expandABS(N, res, DAG)) 11536 return SDValue(); 11537 11538 return res; 11539 } 11540 11541 /// PerformADDECombine - Target-specific dag combine transform from 11542 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 11543 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 11544 static SDValue PerformADDECombine(SDNode *N, 11545 TargetLowering::DAGCombinerInfo &DCI, 11546 const ARMSubtarget *Subtarget) { 11547 // Only ARM and Thumb2 support UMLAL/SMLAL. 11548 if (Subtarget->isThumb1Only()) 11549 return PerformAddeSubeCombine(N, DCI, Subtarget); 11550 11551 // Only perform the checks after legalize when the pattern is available. 11552 if (DCI.isBeforeLegalize()) return SDValue(); 11553 11554 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 11555 } 11556 11557 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 11558 /// operands N0 and N1. This is a helper for PerformADDCombine that is 11559 /// called with the default operands, and if that fails, with commuted 11560 /// operands. 11561 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 11562 TargetLowering::DAGCombinerInfo &DCI, 11563 const ARMSubtarget *Subtarget){ 11564 // Attempt to create vpadd for this add. 11565 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 11566 return Result; 11567 11568 // Attempt to create vpaddl for this add. 11569 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 11570 return Result; 11571 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 11572 Subtarget)) 11573 return Result; 11574 11575 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11576 if (N0.getNode()->hasOneUse()) 11577 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 11578 return Result; 11579 return SDValue(); 11580 } 11581 11582 bool 11583 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 11584 CombineLevel Level) const { 11585 if (Level == BeforeLegalizeTypes) 11586 return true; 11587 11588 if (N->getOpcode() != ISD::SHL) 11589 return true; 11590 11591 if (Subtarget->isThumb1Only()) { 11592 // Avoid making expensive immediates by commuting shifts. (This logic 11593 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 11594 // for free.) 
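    // For example, commuting (shl (add x, 200), 2) into (add (shl x, 2), 800)
    // would replace a legal Thumb1 immediate (200) with one that needs a
    // separate constant materialization (800).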
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}

static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2)  -> (or  (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
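      // ARM and Thumb2 data-processing instructions accept at most one
      // shifted-register operand (e.g. add r0, r1, r2, lsl #2), so a user
      // that already consumes a shift has no room to absorb this one.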
11684 if (U->getOperand(0).getOpcode() == ISD::SHL || 11685 U->getOperand(1).getOpcode() == ISD::SHL) 11686 return SDValue(); 11687 break; 11688 } 11689 } 11690 11691 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 11692 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 11693 return SDValue(); 11694 11695 if (N->getOperand(0).getOpcode() != ISD::SHL) 11696 return SDValue(); 11697 11698 SDValue SHL = N->getOperand(0); 11699 11700 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11701 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 11702 if (!C1ShlC2 || !C2) 11703 return SDValue(); 11704 11705 APInt C2Int = C2->getAPIntValue(); 11706 APInt C1Int = C1ShlC2->getAPIntValue(); 11707 11708 // Check that performing a lshr will not lose any information. 11709 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 11710 C2Int.getBitWidth() - C2->getZExtValue()); 11711 if ((C1Int & Mask) != C1Int) 11712 return SDValue(); 11713 11714 // Shift the first constant. 11715 C1Int.lshrInPlace(C2Int); 11716 11717 // The immediates are encoded as an 8-bit value that can be rotated. 11718 auto LargeImm = [](const APInt &Imm) { 11719 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 11720 return Imm.getBitWidth() - Zeros > 8; 11721 }; 11722 11723 if (LargeImm(C1Int) || LargeImm(C2Int)) 11724 return SDValue(); 11725 11726 SelectionDAG &DAG = DCI.DAG; 11727 SDLoc dl(N); 11728 SDValue X = SHL.getOperand(0); 11729 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 11730 DAG.getConstant(C1Int, dl, MVT::i32)); 11731 // Shift left to compensate for the lshr of C1Int. 11732 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 11733 11734 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 11735 SHL.dump(); N->dump()); 11736 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 11737 return Res; 11738 } 11739 11740 11741 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 11742 /// 11743 static SDValue PerformADDCombine(SDNode *N, 11744 TargetLowering::DAGCombinerInfo &DCI, 11745 const ARMSubtarget *Subtarget) { 11746 SDValue N0 = N->getOperand(0); 11747 SDValue N1 = N->getOperand(1); 11748 11749 // Only works one way, because it needs an immediate operand. 11750 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11751 return Result; 11752 11753 // First try with the default operand order. 11754 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 11755 return Result; 11756 11757 // If that didn't work, try again with the operands commuted. 11758 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 11759 } 11760 11761 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 11762 /// 11763 static SDValue PerformSUBCombine(SDNode *N, 11764 TargetLowering::DAGCombinerInfo &DCI) { 11765 SDValue N0 = N->getOperand(0); 11766 SDValue N1 = N->getOperand(1); 11767 11768 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11769 if (N1.getNode()->hasOneUse()) 11770 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 11771 return Result; 11772 11773 return SDValue(); 11774 } 11775 11776 /// PerformVMULCombine 11777 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 11778 /// special multiplier accumulator forwarding. 
11779 /// vmul d3, d0, d2 11780 /// vmla d3, d1, d2 11781 /// is faster than 11782 /// vadd d3, d0, d1 11783 /// vmul d3, d3, d2 11784 // However, for (A + B) * (A + B), 11785 // vadd d2, d0, d1 11786 // vmul d3, d0, d2 11787 // vmla d3, d1, d2 11788 // is slower than 11789 // vadd d2, d0, d1 11790 // vmul d3, d2, d2 11791 static SDValue PerformVMULCombine(SDNode *N, 11792 TargetLowering::DAGCombinerInfo &DCI, 11793 const ARMSubtarget *Subtarget) { 11794 if (!Subtarget->hasVMLxForwarding()) 11795 return SDValue(); 11796 11797 SelectionDAG &DAG = DCI.DAG; 11798 SDValue N0 = N->getOperand(0); 11799 SDValue N1 = N->getOperand(1); 11800 unsigned Opcode = N0.getOpcode(); 11801 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11802 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 11803 Opcode = N1.getOpcode(); 11804 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 11805 Opcode != ISD::FADD && Opcode != ISD::FSUB) 11806 return SDValue(); 11807 std::swap(N0, N1); 11808 } 11809 11810 if (N0 == N1) 11811 return SDValue(); 11812 11813 EVT VT = N->getValueType(0); 11814 SDLoc DL(N); 11815 SDValue N00 = N0->getOperand(0); 11816 SDValue N01 = N0->getOperand(1); 11817 return DAG.getNode(Opcode, DL, VT, 11818 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 11819 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 11820 } 11821 11822 static SDValue PerformMULCombine(SDNode *N, 11823 TargetLowering::DAGCombinerInfo &DCI, 11824 const ARMSubtarget *Subtarget) { 11825 SelectionDAG &DAG = DCI.DAG; 11826 11827 if (Subtarget->isThumb1Only()) 11828 return SDValue(); 11829 11830 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11831 return SDValue(); 11832 11833 EVT VT = N->getValueType(0); 11834 if (VT.is64BitVector() || VT.is128BitVector()) 11835 return PerformVMULCombine(N, DCI, Subtarget); 11836 if (VT != MVT::i32) 11837 return SDValue(); 11838 11839 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11840 if (!C) 11841 return SDValue(); 11842 11843 int64_t MulAmt = C->getSExtValue(); 11844 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 11845 11846 ShiftAmt = ShiftAmt & (32 - 1); 11847 SDValue V = N->getOperand(0); 11848 SDLoc DL(N); 11849 11850 SDValue Res; 11851 MulAmt >>= ShiftAmt; 11852 11853 if (MulAmt >= 0) { 11854 if (isPowerOf2_32(MulAmt - 1)) { 11855 // (mul x, 2^N + 1) => (add (shl x, N), x) 11856 Res = DAG.getNode(ISD::ADD, DL, VT, 11857 V, 11858 DAG.getNode(ISD::SHL, DL, VT, 11859 V, 11860 DAG.getConstant(Log2_32(MulAmt - 1), DL, 11861 MVT::i32))); 11862 } else if (isPowerOf2_32(MulAmt + 1)) { 11863 // (mul x, 2^N - 1) => (sub (shl x, N), x) 11864 Res = DAG.getNode(ISD::SUB, DL, VT, 11865 DAG.getNode(ISD::SHL, DL, VT, 11866 V, 11867 DAG.getConstant(Log2_32(MulAmt + 1), DL, 11868 MVT::i32)), 11869 V); 11870 } else 11871 return SDValue(); 11872 } else { 11873 uint64_t MulAmtAbs = -MulAmt; 11874 if (isPowerOf2_32(MulAmtAbs + 1)) { 11875 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 11876 Res = DAG.getNode(ISD::SUB, DL, VT, 11877 V, 11878 DAG.getNode(ISD::SHL, DL, VT, 11879 V, 11880 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 11881 MVT::i32))); 11882 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 11883 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 11884 Res = DAG.getNode(ISD::ADD, DL, VT, 11885 V, 11886 DAG.getNode(ISD::SHL, DL, VT, 11887 V, 11888 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 11889 MVT::i32))); 11890 Res = DAG.getNode(ISD::SUB, DL, VT, 11891 DAG.getConstant(0, DL, MVT::i32), Res); 11892 } else 11893 return SDValue(); 11894 } 11895 11896 if (ShiftAmt != 0) 11897 Res = 
DAG.getNode(ISD::SHL, DL, VT, 11898 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 11899 11900 // Do not add new nodes to DAG combiner worklist. 11901 DCI.CombineTo(N, Res, false); 11902 return SDValue(); 11903 } 11904 11905 static SDValue CombineANDShift(SDNode *N, 11906 TargetLowering::DAGCombinerInfo &DCI, 11907 const ARMSubtarget *Subtarget) { 11908 // Allow DAGCombine to pattern-match before we touch the canonical form. 11909 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11910 return SDValue(); 11911 11912 if (N->getValueType(0) != MVT::i32) 11913 return SDValue(); 11914 11915 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11916 if (!N1C) 11917 return SDValue(); 11918 11919 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 11920 // Don't transform uxtb/uxth. 11921 if (C1 == 255 || C1 == 65535) 11922 return SDValue(); 11923 11924 SDNode *N0 = N->getOperand(0).getNode(); 11925 if (!N0->hasOneUse()) 11926 return SDValue(); 11927 11928 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 11929 return SDValue(); 11930 11931 bool LeftShift = N0->getOpcode() == ISD::SHL; 11932 11933 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 11934 if (!N01C) 11935 return SDValue(); 11936 11937 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 11938 if (!C2 || C2 >= 32) 11939 return SDValue(); 11940 11941 // Clear irrelevant bits in the mask. 11942 if (LeftShift) 11943 C1 &= (-1U << C2); 11944 else 11945 C1 &= (-1U >> C2); 11946 11947 SelectionDAG &DAG = DCI.DAG; 11948 SDLoc DL(N); 11949 11950 // We have a pattern of the form "(and (shl x, c2) c1)" or 11951 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 11952 // transform to a pair of shifts, to save materializing c1. 11953 11954 // First pattern: right shift, then mask off leading bits. 11955 // FIXME: Use demanded bits? 11956 if (!LeftShift && isMask_32(C1)) { 11957 uint32_t C3 = countLeadingZeros(C1); 11958 if (C2 < C3) { 11959 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 11960 DAG.getConstant(C3 - C2, DL, MVT::i32)); 11961 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 11962 DAG.getConstant(C3, DL, MVT::i32)); 11963 } 11964 } 11965 11966 // First pattern, reversed: left shift, then mask off trailing bits. 11967 if (LeftShift && isMask_32(~C1)) { 11968 uint32_t C3 = countTrailingZeros(C1); 11969 if (C2 < C3) { 11970 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 11971 DAG.getConstant(C3 - C2, DL, MVT::i32)); 11972 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 11973 DAG.getConstant(C3, DL, MVT::i32)); 11974 } 11975 } 11976 11977 // Second pattern: left shift, then mask off leading bits. 11978 // FIXME: Use demanded bits? 11979 if (LeftShift && isShiftedMask_32(C1)) { 11980 uint32_t Trailing = countTrailingZeros(C1); 11981 uint32_t C3 = countLeadingZeros(C1); 11982 if (Trailing == C2 && C2 + C3 < 32) { 11983 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 11984 DAG.getConstant(C2 + C3, DL, MVT::i32)); 11985 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 11986 DAG.getConstant(C3, DL, MVT::i32)); 11987 } 11988 } 11989 11990 // Second pattern, reversed: right shift, then mask off trailing bits. 11991 // FIXME: Handle other patterns of known/demanded bits. 
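  // E.g. the handled case (and (srl x, 20), 0x0ff0) becomes
  // (shl (srl x, 24), 4); two shifts are cheaper on Thumb1 than
  // materializing the 0x0ff0 mask.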
11992 if (!LeftShift && isShiftedMask_32(C1)) { 11993 uint32_t Leading = countLeadingZeros(C1); 11994 uint32_t C3 = countTrailingZeros(C1); 11995 if (Leading == C2 && C2 + C3 < 32) { 11996 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 11997 DAG.getConstant(C2 + C3, DL, MVT::i32)); 11998 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 11999 DAG.getConstant(C3, DL, MVT::i32)); 12000 } 12001 } 12002 12003 // FIXME: Transform "(and (shl x, c2) c1)" -> 12004 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 12005 // c1. 12006 return SDValue(); 12007 } 12008 12009 static SDValue PerformANDCombine(SDNode *N, 12010 TargetLowering::DAGCombinerInfo &DCI, 12011 const ARMSubtarget *Subtarget) { 12012 // Attempt to use immediate-form VBIC 12013 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12014 SDLoc dl(N); 12015 EVT VT = N->getValueType(0); 12016 SelectionDAG &DAG = DCI.DAG; 12017 12018 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12019 return SDValue(); 12020 12021 APInt SplatBits, SplatUndef; 12022 unsigned SplatBitSize; 12023 bool HasAnyUndefs; 12024 if (BVN && Subtarget->hasNEON() && 12025 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12026 if (SplatBitSize <= 64) { 12027 EVT VbicVT; 12028 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 12029 SplatUndef.getZExtValue(), SplatBitSize, 12030 DAG, dl, VbicVT, VT.is128BitVector(), 12031 OtherModImm); 12032 if (Val.getNode()) { 12033 SDValue Input = 12034 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 12035 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 12036 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 12037 } 12038 } 12039 } 12040 12041 if (!Subtarget->isThumb1Only()) { 12042 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 12043 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 12044 return Result; 12045 12046 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12047 return Result; 12048 } 12049 12050 if (Subtarget->isThumb1Only()) 12051 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 12052 return Result; 12053 12054 return SDValue(); 12055 } 12056 12057 // Try combining OR nodes to SMULWB, SMULWT. 12058 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 12059 TargetLowering::DAGCombinerInfo &DCI, 12060 const ARMSubtarget *Subtarget) { 12061 if (!Subtarget->hasV6Ops() || 12062 (Subtarget->isThumb() && 12063 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 12064 return SDValue(); 12065 12066 SDValue SRL = OR->getOperand(0); 12067 SDValue SHL = OR->getOperand(1); 12068 12069 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 12070 SRL = OR->getOperand(1); 12071 SHL = OR->getOperand(0); 12072 } 12073 if (!isSRL16(SRL) || !isSHL16(SHL)) 12074 return SDValue(); 12075 12076 // The first operands to the shifts need to be the two results from the 12077 // same smul_lohi node. 12078 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 12079 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 12080 return SDValue(); 12081 12082 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 12083 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 12084 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 12085 return SDValue(); 12086 12087 // Now we have: 12088 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) 12089 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. 
  // For SMULWB the 16-bit value will be sign extended in some way.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern
    // to match as-is.
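    // For example, Mask == 0xffff00ff and Mask2 == 0x0000ff00 hit case 2a:
    // bits [15:8] of B are copied into bits [15:8] of A, i.e.
    // ARMbfi A, (lsr B, 8), 0xffff00ff.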
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
12238 return SDValue(N, 0); 12239 } 12240 12241 return SDValue(); 12242 } 12243 12244 static bool isValidMVECond(unsigned CC, bool IsFloat) { 12245 switch (CC) { 12246 case ARMCC::EQ: 12247 case ARMCC::NE: 12248 case ARMCC::LE: 12249 case ARMCC::GT: 12250 case ARMCC::GE: 12251 case ARMCC::LT: 12252 return true; 12253 case ARMCC::HS: 12254 case ARMCC::HI: 12255 return !IsFloat; 12256 default: 12257 return false; 12258 }; 12259 } 12260 12261 static SDValue PerformORCombine_i1(SDNode *N, 12262 TargetLowering::DAGCombinerInfo &DCI, 12263 const ARMSubtarget *Subtarget) { 12264 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 12265 // together with predicates 12266 EVT VT = N->getValueType(0); 12267 SDValue N0 = N->getOperand(0); 12268 SDValue N1 = N->getOperand(1); 12269 12270 ARMCC::CondCodes CondCode0 = ARMCC::AL; 12271 ARMCC::CondCodes CondCode1 = ARMCC::AL; 12272 if (N0->getOpcode() == ARMISD::VCMP) 12273 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) 12274 ->getZExtValue(); 12275 else if (N0->getOpcode() == ARMISD::VCMPZ) 12276 CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) 12277 ->getZExtValue(); 12278 if (N1->getOpcode() == ARMISD::VCMP) 12279 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) 12280 ->getZExtValue(); 12281 else if (N1->getOpcode() == ARMISD::VCMPZ) 12282 CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) 12283 ->getZExtValue(); 12284 12285 if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) 12286 return SDValue(); 12287 12288 unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); 12289 unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); 12290 12291 if (!isValidMVECond(Opposite0, 12292 N0->getOperand(0)->getValueType(0).isFloatingPoint()) || 12293 !isValidMVECond(Opposite1, 12294 N1->getOperand(0)->getValueType(0).isFloatingPoint())) 12295 return SDValue(); 12296 12297 SmallVector<SDValue, 4> Ops0; 12298 Ops0.push_back(N0->getOperand(0)); 12299 if (N0->getOpcode() == ARMISD::VCMP) 12300 Ops0.push_back(N0->getOperand(1)); 12301 Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); 12302 SmallVector<SDValue, 4> Ops1; 12303 Ops1.push_back(N1->getOperand(0)); 12304 if (N1->getOpcode() == ARMISD::VCMP) 12305 Ops1.push_back(N1->getOperand(1)); 12306 Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); 12307 12308 SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); 12309 SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); 12310 SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); 12311 return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, 12312 DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); 12313 } 12314 12315 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 12316 static SDValue PerformORCombine(SDNode *N, 12317 TargetLowering::DAGCombinerInfo &DCI, 12318 const ARMSubtarget *Subtarget) { 12319 // Attempt to use immediate-form VORR 12320 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 12321 SDLoc dl(N); 12322 EVT VT = N->getValueType(0); 12323 SelectionDAG &DAG = DCI.DAG; 12324 12325 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12326 return SDValue(); 12327 12328 APInt SplatBits, SplatUndef; 12329 unsigned SplatBitSize; 12330 bool HasAnyUndefs; 12331 if (BVN && Subtarget->hasNEON() && 12332 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 12333 if (SplatBitSize <= 64) { 12334 EVT 
VorrVT; 12335 SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), 12336 SplatUndef.getZExtValue(), SplatBitSize, 12337 DAG, dl, VorrVT, VT.is128BitVector(), 12338 OtherModImm); 12339 if (Val.getNode()) { 12340 SDValue Input = 12341 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 12342 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 12343 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 12344 } 12345 } 12346 } 12347 12348 if (!Subtarget->isThumb1Only()) { 12349 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 12350 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12351 return Result; 12352 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 12353 return Result; 12354 } 12355 12356 SDValue N0 = N->getOperand(0); 12357 SDValue N1 = N->getOperand(1); 12358 12359 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 12360 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 12361 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12362 12363 // The code below optimizes (or (and X, Y), Z). 12364 // The AND operand needs to have a single user to make these optimizations 12365 // profitable. 12366 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 12367 return SDValue(); 12368 12369 APInt SplatUndef; 12370 unsigned SplatBitSize; 12371 bool HasAnyUndefs; 12372 12373 APInt SplatBits0, SplatBits1; 12374 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 12375 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 12376 // Ensure that the second operand of both ands are constants 12377 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 12378 HasAnyUndefs) && !HasAnyUndefs) { 12379 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 12380 HasAnyUndefs) && !HasAnyUndefs) { 12381 // Ensure that the bit width of the constants are the same and that 12382 // the splat arguments are logical inverses as per the pattern we 12383 // are trying to simplify. 12384 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 12385 SplatBits0 == ~SplatBits1) { 12386 // Canonicalize the vector type to make instruction selection 12387 // simpler. 12388 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 12389 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 12390 N0->getOperand(1), 12391 N0->getOperand(0), 12392 N1->getOperand(0)); 12393 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 12394 } 12395 } 12396 } 12397 } 12398 12399 if (Subtarget->hasMVEIntegerOps() && 12400 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) 12401 return PerformORCombine_i1(N, DCI, Subtarget); 12402 12403 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 12404 // reasonable. 
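  // For example, (or (and A, 0xffff00ff), 0x4500) can become a BFI that
  // writes the constant 0x45 into bits [15:8] of A.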
12405 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 12406 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 12407 return Res; 12408 } 12409 12410 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12411 return Result; 12412 12413 return SDValue(); 12414 } 12415 12416 static SDValue PerformXORCombine(SDNode *N, 12417 TargetLowering::DAGCombinerInfo &DCI, 12418 const ARMSubtarget *Subtarget) { 12419 EVT VT = N->getValueType(0); 12420 SelectionDAG &DAG = DCI.DAG; 12421 12422 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12423 return SDValue(); 12424 12425 if (!Subtarget->isThumb1Only()) { 12426 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 12427 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 12428 return Result; 12429 12430 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12431 return Result; 12432 } 12433 12434 return SDValue(); 12435 } 12436 12437 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 12438 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 12439 // their position in "to" (Rd). 12440 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 12441 assert(N->getOpcode() == ARMISD::BFI); 12442 12443 SDValue From = N->getOperand(1); 12444 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 12445 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 12446 12447 // If the Base came from a SHR #C, we can deduce that it is really testing bit 12448 // #C in the base of the SHR. 12449 if (From->getOpcode() == ISD::SRL && 12450 isa<ConstantSDNode>(From->getOperand(1))) { 12451 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 12452 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 12453 FromMask <<= Shift.getLimitedValue(31); 12454 From = From->getOperand(0); 12455 } 12456 12457 return From; 12458 } 12459 12460 // If A and B contain one contiguous set of bits, does A | B == A . B? 12461 // 12462 // Neither A nor B must be zero. 12463 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 12464 unsigned LastActiveBitInA = A.countTrailingZeros(); 12465 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 12466 return LastActiveBitInA - 1 == FirstActiveBitInB; 12467 } 12468 12469 static SDValue FindBFIToCombineWith(SDNode *N) { 12470 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 12471 // if one exists. 12472 APInt ToMask, FromMask; 12473 SDValue From = ParseBFI(N, ToMask, FromMask); 12474 SDValue To = N->getOperand(0); 12475 12476 // Now check for a compatible BFI to merge with. We can pass through BFIs that 12477 // aren't compatible, but not if they set the same bit in their destination as 12478 // we do (or that of any BFI we're going to combine with). 12479 SDValue V = To; 12480 APInt CombinedToMask = ToMask; 12481 while (V.getOpcode() == ARMISD::BFI) { 12482 APInt NewToMask, NewFromMask; 12483 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 12484 if (NewFrom != From) { 12485 // This BFI has a different base. Keep going. 12486 CombinedToMask |= NewToMask; 12487 V = V.getOperand(0); 12488 continue; 12489 } 12490 12491 // Do the written bits conflict with any we've seen so far? 12492 if ((NewToMask & CombinedToMask).getBoolValue()) 12493 // Conflicting bits - bail out because going further is unsafe. 
12494 return SDValue(); 12495 12496 // Are the new bits contiguous when combined with the old bits? 12497 if (BitsProperlyConcatenate(ToMask, NewToMask) && 12498 BitsProperlyConcatenate(FromMask, NewFromMask)) 12499 return V; 12500 if (BitsProperlyConcatenate(NewToMask, ToMask) && 12501 BitsProperlyConcatenate(NewFromMask, FromMask)) 12502 return V; 12503 12504 // We've seen a write to some bits, so track it. 12505 CombinedToMask |= NewToMask; 12506 // Keep going... 12507 V = V.getOperand(0); 12508 } 12509 12510 return SDValue(); 12511 } 12512 12513 static SDValue PerformBFICombine(SDNode *N, 12514 TargetLowering::DAGCombinerInfo &DCI) { 12515 SDValue N1 = N->getOperand(1); 12516 if (N1.getOpcode() == ISD::AND) { 12517 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 12518 // the bits being cleared by the AND are not demanded by the BFI. 12519 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 12520 if (!N11C) 12521 return SDValue(); 12522 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 12523 unsigned LSB = countTrailingZeros(~InvMask); 12524 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 12525 assert(Width < 12526 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 12527 "undefined behavior"); 12528 unsigned Mask = (1u << Width) - 1; 12529 unsigned Mask2 = N11C->getZExtValue(); 12530 if ((Mask & (~Mask2)) == 0) 12531 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 12532 N->getOperand(0), N1.getOperand(0), 12533 N->getOperand(2)); 12534 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 12535 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 12536 // Keep track of any consecutive bits set that all come from the same base 12537 // value. We can combine these together into a single BFI. 12538 SDValue CombineBFI = FindBFIToCombineWith(N); 12539 if (CombineBFI == SDValue()) 12540 return SDValue(); 12541 12542 // We've found a BFI. 12543 APInt ToMask1, FromMask1; 12544 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 12545 12546 APInt ToMask2, FromMask2; 12547 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 12548 assert(From1 == From2); 12549 (void)From2; 12550 12551 // First, unlink CombineBFI. 12552 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 12553 // Then create a new BFI, combining the two together. 12554 APInt NewFromMask = FromMask1 | FromMask2; 12555 APInt NewToMask = ToMask1 | ToMask2; 12556 12557 EVT VT = N->getValueType(0); 12558 SDLoc dl(N); 12559 12560 if (NewFromMask[0] == 0) 12561 From1 = DCI.DAG.getNode( 12562 ISD::SRL, dl, VT, From1, 12563 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 12564 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 12565 DCI.DAG.getConstant(~NewToMask, dl, VT)); 12566 } 12567 return SDValue(); 12568 } 12569 12570 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 12571 /// ARMISD::VMOVRRD. 
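/// VMOVRRD moves an f64 value into a pair of GPRs (vmov rN, rM, dK). If the
/// double was itself just assembled from two GPRs with VMOVDRR, the original
/// registers can be forwarded directly; if it comes from a simple stack
/// load, the two halves can instead be loaded as i32 values.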
12572 static SDValue PerformVMOVRRDCombine(SDNode *N, 12573 TargetLowering::DAGCombinerInfo &DCI, 12574 const ARMSubtarget *Subtarget) { 12575 // vmovrrd(vmovdrr x, y) -> x,y 12576 SDValue InDouble = N->getOperand(0); 12577 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 12578 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 12579 12580 // vmovrrd(load f64) -> (load i32), (load i32) 12581 SDNode *InNode = InDouble.getNode(); 12582 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 12583 InNode->getValueType(0) == MVT::f64 && 12584 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 12585 !cast<LoadSDNode>(InNode)->isVolatile()) { 12586 // TODO: Should this be done for non-FrameIndex operands? 12587 LoadSDNode *LD = cast<LoadSDNode>(InNode); 12588 12589 SelectionDAG &DAG = DCI.DAG; 12590 SDLoc DL(LD); 12591 SDValue BasePtr = LD->getBasePtr(); 12592 SDValue NewLD1 = 12593 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 12594 LD->getAlignment(), LD->getMemOperand()->getFlags()); 12595 12596 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 12597 DAG.getConstant(4, DL, MVT::i32)); 12598 12599 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 12600 LD->getPointerInfo().getWithOffset(4), 12601 std::min(4U, LD->getAlignment()), 12602 LD->getMemOperand()->getFlags()); 12603 12604 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 12605 if (DCI.DAG.getDataLayout().isBigEndian()) 12606 std::swap (NewLD1, NewLD2); 12607 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 12608 return Result; 12609 } 12610 12611 return SDValue(); 12612 } 12613 12614 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 12615 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 12616 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 12617 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 12618 SDValue Op0 = N->getOperand(0); 12619 SDValue Op1 = N->getOperand(1); 12620 if (Op0.getOpcode() == ISD::BITCAST) 12621 Op0 = Op0.getOperand(0); 12622 if (Op1.getOpcode() == ISD::BITCAST) 12623 Op1 = Op1.getOperand(0); 12624 if (Op0.getOpcode() == ARMISD::VMOVRRD && 12625 Op0.getNode() == Op1.getNode() && 12626 Op0.getResNo() == 0 && Op1.getResNo() == 1) 12627 return DAG.getNode(ISD::BITCAST, SDLoc(N), 12628 N->getValueType(0), Op0.getOperand(0)); 12629 return SDValue(); 12630 } 12631 12632 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 12633 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 12634 /// i64 vector to have f64 elements, since the value can then be loaded 12635 /// directly into a VFP register. 12636 static bool hasNormalLoadOperand(SDNode *N) { 12637 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 12638 for (unsigned i = 0; i < NumElts; ++i) { 12639 SDNode *Elt = N->getOperand(i).getNode(); 12640 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 12641 return true; 12642 } 12643 return false; 12644 } 12645 12646 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 12647 /// ISD::BUILD_VECTOR. 12648 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 12649 TargetLowering::DAGCombinerInfo &DCI, 12650 const ARMSubtarget *Subtarget) { 12651 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 12652 // VMOVRRD is introduced when legalizing i64 types. 
  // It forces the i64 value into a pair of GPRs, which is fine when the
  // value is used as a scalar, but if the i64 value is converted to a
  // vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., something that
  // does not force the use of floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64-bit operands are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // The model is: if more than half of the relevant operands are bitcast
  // from i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
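  // E.g. in a v4f32 ARMISD::BUILD_VECTOR with three (bitcast i32) operands
  // and one constant, NumOfRelevantElts is 3 and NumOfBitCastedElts is 3,
  // so the node is rebuilt below as a chain of v4i32 insert_vector_elts.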
12731 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 12732 return SDValue(); 12733 12734 SelectionDAG &DAG = DCI.DAG; 12735 // Create the new vector type. 12736 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 12737 // Check if the type is legal. 12738 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12739 if (!TLI.isTypeLegal(VecVT)) 12740 return SDValue(); 12741 12742 // Combine: 12743 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 12744 // => BITCAST INSERT_VECTOR_ELT 12745 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 12746 // (BITCAST EN), N. 12747 SDValue Vec = DAG.getUNDEF(VecVT); 12748 SDLoc dl(N); 12749 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 12750 SDValue V = N->getOperand(Idx); 12751 if (V.isUndef()) 12752 continue; 12753 if (V.getOpcode() == ISD::BITCAST && 12754 V->getOperand(0).getValueType() == MVT::i32) 12755 // Fold obvious case. 12756 V = V.getOperand(0); 12757 else { 12758 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 12759 // Make the DAGCombiner fold the bitcasts. 12760 DCI.AddToWorklist(V.getNode()); 12761 } 12762 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 12763 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 12764 } 12765 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 12766 // Make the DAGCombiner fold the bitcasts. 12767 DCI.AddToWorklist(Vec.getNode()); 12768 return Vec; 12769 } 12770 12771 static SDValue 12772 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12773 EVT VT = N->getValueType(0); 12774 SDValue Op = N->getOperand(0); 12775 SDLoc dl(N); 12776 12777 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 12778 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 12779 // If the valuetypes are the same, we can remove the cast entirely. 12780 if (Op->getOperand(0).getValueType() == VT) 12781 return Op->getOperand(0); 12782 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, 12783 Op->getOperand(0).getValueType(), Op->getOperand(0)); 12784 } 12785 12786 return SDValue(); 12787 } 12788 12789 /// PerformInsertEltCombine - Target-specific dag combine xforms for 12790 /// ISD::INSERT_VECTOR_ELT. 12791 static SDValue PerformInsertEltCombine(SDNode *N, 12792 TargetLowering::DAGCombinerInfo &DCI) { 12793 // Bitcast an i64 load inserted into a vector to f64. 12794 // Otherwise, the i64 value will be legalized to a pair of i32 values. 12795 EVT VT = N->getValueType(0); 12796 SDNode *Elt = N->getOperand(1).getNode(); 12797 if (VT.getVectorElementType() != MVT::i64 || 12798 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 12799 return SDValue(); 12800 12801 SelectionDAG &DAG = DCI.DAG; 12802 SDLoc dl(N); 12803 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 12804 VT.getVectorNumElements()); 12805 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 12806 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 12807 // Make the DAGCombiner fold the bitcasts. 12808 DCI.AddToWorklist(Vec.getNode()); 12809 DCI.AddToWorklist(V.getNode()); 12810 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 12811 Vec, V, N->getOperand(2)); 12812 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 12813 } 12814 12815 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 12816 /// ISD::VECTOR_SHUFFLE. 
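/// For example, a shufflevector of two <4 x i16> IR values with an 8-element
/// mask arrives here as a v8i16 shuffle of concat(v1, undef) and
/// concat(v2, undef); rewriting it as shuffle(concat(v1, v2), undef) keeps
/// the data in a single quad register.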
12817 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 12818 // The LLVM shufflevector instruction does not require the shuffle mask 12819 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 12820 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 12821 // operands do not match the mask length, they are extended by concatenating 12822 // them with undef vectors. That is probably the right thing for other 12823 // targets, but for NEON it is better to concatenate two double-register 12824 // size vector operands into a single quad-register size vector. Do that 12825 // transformation here: 12826 // shuffle(concat(v1, undef), concat(v2, undef)) -> 12827 // shuffle(concat(v1, v2), undef) 12828 SDValue Op0 = N->getOperand(0); 12829 SDValue Op1 = N->getOperand(1); 12830 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 12831 Op1.getOpcode() != ISD::CONCAT_VECTORS || 12832 Op0.getNumOperands() != 2 || 12833 Op1.getNumOperands() != 2) 12834 return SDValue(); 12835 SDValue Concat0Op1 = Op0.getOperand(1); 12836 SDValue Concat1Op1 = Op1.getOperand(1); 12837 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 12838 return SDValue(); 12839 // Skip the transformation if any of the types are illegal. 12840 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12841 EVT VT = N->getValueType(0); 12842 if (!TLI.isTypeLegal(VT) || 12843 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 12844 !TLI.isTypeLegal(Concat1Op1.getValueType())) 12845 return SDValue(); 12846 12847 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 12848 Op0.getOperand(0), Op1.getOperand(0)); 12849 // Translate the shuffle mask. 12850 SmallVector<int, 16> NewMask; 12851 unsigned NumElts = VT.getVectorNumElements(); 12852 unsigned HalfElts = NumElts/2; 12853 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 12854 for (unsigned n = 0; n < NumElts; ++n) { 12855 int MaskElt = SVN->getMaskElt(n); 12856 int NewElt = -1; 12857 if (MaskElt < (int)HalfElts) 12858 NewElt = MaskElt; 12859 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 12860 NewElt = HalfElts + MaskElt - NumElts; 12861 NewMask.push_back(NewElt); 12862 } 12863 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 12864 DAG.getUNDEF(VT), NewMask); 12865 } 12866 12867 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 12868 /// NEON load/store intrinsics, and generic vector load/stores, to merge 12869 /// base address updates. 12870 /// For generic load/stores, the memory type is assumed to be a vector. 12871 /// The caller is assumed to have checked legality. 12872 static SDValue CombineBaseUpdate(SDNode *N, 12873 TargetLowering::DAGCombinerInfo &DCI) { 12874 SelectionDAG &DAG = DCI.DAG; 12875 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 12876 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 12877 const bool isStore = N->getOpcode() == ISD::STORE; 12878 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 12879 SDValue Addr = N->getOperand(AddrOpIdx); 12880 MemSDNode *MemN = cast<MemSDNode>(N); 12881 SDLoc dl(N); 12882 12883 // Search for a use of the address operand that is an increment. 12884 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 12885 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 12886 SDNode *User = *UI; 12887 if (User->getOpcode() != ISD::ADD || 12888 UI.getUse().getResNo() != Addr.getResNo()) 12889 continue; 12890 12891 // Check that the add is independent of the load/store. 
Otherwise, folding 12892 // it would create a cycle. We can avoid searching through Addr as it's a 12893 // predecessor to both. 12894 SmallPtrSet<const SDNode *, 32> Visited; 12895 SmallVector<const SDNode *, 16> Worklist; 12896 Visited.insert(Addr.getNode()); 12897 Worklist.push_back(N); 12898 Worklist.push_back(User); 12899 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 12900 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 12901 continue; 12902 12903 // Find the new opcode for the updating load/store. 12904 bool isLoadOp = true; 12905 bool isLaneOp = false; 12906 unsigned NewOpc = 0; 12907 unsigned NumVecs = 0; 12908 if (isIntrinsic) { 12909 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 12910 switch (IntNo) { 12911 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 12912 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 12913 NumVecs = 1; break; 12914 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 12915 NumVecs = 2; break; 12916 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 12917 NumVecs = 3; break; 12918 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 12919 NumVecs = 4; break; 12920 case Intrinsic::arm_neon_vld2dup: 12921 case Intrinsic::arm_neon_vld3dup: 12922 case Intrinsic::arm_neon_vld4dup: 12923 // TODO: Support updating VLDxDUP nodes. For now, we just skip 12924 // combining base updates for such intrinsics. 12925 continue; 12926 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 12927 NumVecs = 2; isLaneOp = true; break; 12928 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 12929 NumVecs = 3; isLaneOp = true; break; 12930 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 12931 NumVecs = 4; isLaneOp = true; break; 12932 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 12933 NumVecs = 1; isLoadOp = false; break; 12934 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 12935 NumVecs = 2; isLoadOp = false; break; 12936 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 12937 NumVecs = 3; isLoadOp = false; break; 12938 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 12939 NumVecs = 4; isLoadOp = false; break; 12940 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 12941 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 12942 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 12943 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 12944 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 12945 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 12946 } 12947 } else { 12948 isLaneOp = true; 12949 switch (N->getOpcode()) { 12950 default: llvm_unreachable("unexpected opcode for Neon base update"); 12951 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 12952 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 12953 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 12954 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 12955 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 12956 NumVecs = 1; isLaneOp = false; break; 12957 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 12958 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 12959 } 12960 } 12961 12962 // Find the size of memory referenced by the load/store. 
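    // (E.g. a vld2 of two v4i32 vectors touches 32 bytes; a "lane" operation
    // only touches one element per vector, hence the division below.)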
12963 EVT VecTy; 12964 if (isLoadOp) { 12965 VecTy = N->getValueType(0); 12966 } else if (isIntrinsic) { 12967 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 12968 } else { 12969 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 12970 VecTy = N->getOperand(1).getValueType(); 12971 } 12972 12973 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 12974 if (isLaneOp) 12975 NumBytes /= VecTy.getVectorNumElements(); 12976 12977 // If the increment is a constant, it must match the memory ref size. 12978 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 12979 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 12980 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 12981 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 12982 // separate instructions that make it harder to use a non-constant update. 12983 continue; 12984 } 12985 12986 // OK, we found an ADD we can fold into the base update. 12987 // Now, create a _UPD node, taking care of not breaking alignment. 12988 12989 EVT AlignedVecTy = VecTy; 12990 unsigned Alignment = MemN->getAlignment(); 12991 12992 // If this is a less-than-standard-aligned load/store, change the type to 12993 // match the standard alignment. 12994 // The alignment is overlooked when selecting _UPD variants; and it's 12995 // easier to introduce bitcasts here than fix that. 12996 // There are 3 ways to get to this base-update combine: 12997 // - intrinsics: they are assumed to be properly aligned (to the standard 12998 // alignment of the memory type), so we don't need to do anything. 12999 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 13000 // intrinsics, so, likewise, there's nothing to do. 13001 // - generic load/store instructions: the alignment is specified as an 13002 // explicit operand, rather than implicitly as the standard alignment 13003 // of the memory type (like the intrisics). We need to change the 13004 // memory type to match the explicit alignment. That way, we don't 13005 // generate non-standard-aligned ARMISD::VLDx nodes. 13006 if (isa<LSBaseSDNode>(N)) { 13007 if (Alignment == 0) 13008 Alignment = 1; 13009 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 13010 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 13011 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 13012 assert(!isLaneOp && "Unexpected generic load/store lane."); 13013 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 13014 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 13015 } 13016 // Don't set an explicit alignment on regular load/stores that we want 13017 // to transform to VLD/VST 1_UPD nodes. 13018 // This matches the behavior of regular load/stores, which only get an 13019 // explicit alignment if the MMO alignment is larger than the standard 13020 // alignment of the memory type. 13021 // Intrinsics, however, always get an explicit alignment, set to the 13022 // alignment of the MMO. 13023 Alignment = 1; 13024 } 13025 13026 // Create the new updating load/store node. 13027 // First, create an SDVTList for the new updating node's results. 13028 EVT Tys[6]; 13029 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 13030 unsigned n; 13031 for (n = 0; n < NumResultVecs; ++n) 13032 Tys[n] = AlignedVecTy; 13033 Tys[n++] = MVT::i32; 13034 Tys[n] = MVT::Other; 13035 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 13036 13037 // Then, gather the new node's operands. 
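    // E.g. for an arm_neon_vst2 intrinsic the updating node ends up as
    // (VST2_UPD chain, addr, inc, data0, data1, align).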
13038 SmallVector<SDValue, 8> Ops; 13039 Ops.push_back(N->getOperand(0)); // incoming chain 13040 Ops.push_back(N->getOperand(AddrOpIdx)); 13041 Ops.push_back(Inc); 13042 13043 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 13044 // Try to match the intrinsic's signature 13045 Ops.push_back(StN->getValue()); 13046 } else { 13047 // Loads (and of course intrinsics) match the intrinsics' signature, 13048 // so just add all but the alignment operand. 13049 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 13050 Ops.push_back(N->getOperand(i)); 13051 } 13052 13053 // For all node types, the alignment operand is always the last one. 13054 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 13055 13056 // If this is a non-standard-aligned STORE, the penultimate operand is the 13057 // stored value. Bitcast it to the aligned type. 13058 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 13059 SDValue &StVal = Ops[Ops.size()-2]; 13060 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 13061 } 13062 13063 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 13064 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 13065 MemN->getMemOperand()); 13066 13067 // Update the uses. 13068 SmallVector<SDValue, 5> NewResults; 13069 for (unsigned i = 0; i < NumResultVecs; ++i) 13070 NewResults.push_back(SDValue(UpdN.getNode(), i)); 13071 13072 // If this is an non-standard-aligned LOAD, the first result is the loaded 13073 // value. Bitcast it to the expected result type. 13074 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 13075 SDValue &LdVal = NewResults[0]; 13076 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 13077 } 13078 13079 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 13080 DCI.CombineTo(N, NewResults); 13081 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 13082 13083 break; 13084 } 13085 return SDValue(); 13086 } 13087 13088 static SDValue PerformVLDCombine(SDNode *N, 13089 TargetLowering::DAGCombinerInfo &DCI) { 13090 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13091 return SDValue(); 13092 13093 return CombineBaseUpdate(N, DCI); 13094 } 13095 13096 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 13097 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 13098 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 13099 /// return true. 13100 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 13101 SelectionDAG &DAG = DCI.DAG; 13102 EVT VT = N->getValueType(0); 13103 // vldN-dup instructions only support 64-bit vectors for N > 1. 13104 if (!VT.is64BitVector()) 13105 return false; 13106 13107 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 
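  // E.g. if every vector result of a vld2lane is only used by VDUPLANEs of
  // the lane that was loaded, the whole group can be replaced by a vld2dup,
  // which loads one element per vector and replicates it across all lanes.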
13108 SDNode *VLD = N->getOperand(0).getNode(); 13109 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 13110 return false; 13111 unsigned NumVecs = 0; 13112 unsigned NewOpc = 0; 13113 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 13114 if (IntNo == Intrinsic::arm_neon_vld2lane) { 13115 NumVecs = 2; 13116 NewOpc = ARMISD::VLD2DUP; 13117 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 13118 NumVecs = 3; 13119 NewOpc = ARMISD::VLD3DUP; 13120 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 13121 NumVecs = 4; 13122 NewOpc = ARMISD::VLD4DUP; 13123 } else { 13124 return false; 13125 } 13126 13127 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 13128 // numbers match the load. 13129 unsigned VLDLaneNo = 13130 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 13131 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13132 UI != UE; ++UI) { 13133 // Ignore uses of the chain result. 13134 if (UI.getUse().getResNo() == NumVecs) 13135 continue; 13136 SDNode *User = *UI; 13137 if (User->getOpcode() != ARMISD::VDUPLANE || 13138 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 13139 return false; 13140 } 13141 13142 // Create the vldN-dup node. 13143 EVT Tys[5]; 13144 unsigned n; 13145 for (n = 0; n < NumVecs; ++n) 13146 Tys[n] = VT; 13147 Tys[n] = MVT::Other; 13148 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 13149 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 13150 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 13151 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 13152 Ops, VLDMemInt->getMemoryVT(), 13153 VLDMemInt->getMemOperand()); 13154 13155 // Update the uses. 13156 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13157 UI != UE; ++UI) { 13158 unsigned ResNo = UI.getUse().getResNo(); 13159 // Ignore uses of the chain result. 13160 if (ResNo == NumVecs) 13161 continue; 13162 SDNode *User = *UI; 13163 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 13164 } 13165 13166 // Now the vldN-lane intrinsic is dead except for its chain result. 13167 // Update uses of the chain. 13168 std::vector<SDValue> VLDDupResults; 13169 for (unsigned n = 0; n < NumVecs; ++n) 13170 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 13171 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 13172 DCI.CombineTo(VLD, VLDDupResults); 13173 13174 return true; 13175 } 13176 13177 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 13178 /// ARMISD::VDUPLANE. 13179 static SDValue PerformVDUPLANECombine(SDNode *N, 13180 TargetLowering::DAGCombinerInfo &DCI) { 13181 SDValue Op = N->getOperand(0); 13182 13183 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 13184 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 13185 if (CombineVLDDUP(N, DCI)) 13186 return SDValue(N, 0); 13187 13188 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 13189 // redundant. Ignore bit_converts for now; element sizes are checked below. 13190 while (Op.getOpcode() == ISD::BITCAST) 13191 Op = Op.getOperand(0); 13192 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 13193 return SDValue(); 13194 13195 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 13196 unsigned EltSize = Op.getScalarValueSizeInBits(); 13197 // The canonical VMOV for a zero vector uses a 32-bit element size. 
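  // A zero splat is valid for any element size, though, so treat it as 8 bits
  // to avoid rejecting it in the element-size check below.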
13198 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13199 unsigned EltBits; 13200 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 13201 EltSize = 8; 13202 EVT VT = N->getValueType(0); 13203 if (EltSize > VT.getScalarSizeInBits()) 13204 return SDValue(); 13205 13206 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 13207 } 13208 13209 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 13210 static SDValue PerformVDUPCombine(SDNode *N, 13211 TargetLowering::DAGCombinerInfo &DCI, 13212 const ARMSubtarget *Subtarget) { 13213 SelectionDAG &DAG = DCI.DAG; 13214 SDValue Op = N->getOperand(0); 13215 13216 if (!Subtarget->hasNEON()) 13217 return SDValue(); 13218 13219 // Match VDUP(LOAD) -> VLD1DUP. 13220 // We match this pattern here rather than waiting for isel because the 13221 // transform is only legal for unindexed loads. 13222 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 13223 if (LD && Op.hasOneUse() && LD->isUnindexed() && 13224 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 13225 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 13226 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 13227 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 13228 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 13229 Ops, LD->getMemoryVT(), 13230 LD->getMemOperand()); 13231 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 13232 return VLDDup; 13233 } 13234 13235 return SDValue(); 13236 } 13237 13238 static SDValue PerformLOADCombine(SDNode *N, 13239 TargetLowering::DAGCombinerInfo &DCI) { 13240 EVT VT = N->getValueType(0); 13241 13242 // If this is a legal vector load, try to combine it into a VLD1_UPD. 13243 if (ISD::isNormalLoad(N) && VT.isVector() && 13244 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13245 return CombineBaseUpdate(N, DCI); 13246 13247 return SDValue(); 13248 } 13249 13250 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 13251 // pack all of the elements in one place. Next, store to memory in fewer 13252 // chunks. 13253 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 13254 SelectionDAG &DAG) { 13255 SDValue StVal = St->getValue(); 13256 EVT VT = StVal.getValueType(); 13257 if (!St->isTruncatingStore() || !VT.isVector()) 13258 return SDValue(); 13259 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13260 EVT StVT = St->getMemoryVT(); 13261 unsigned NumElems = VT.getVectorNumElements(); 13262 assert(StVT != VT && "Cannot truncate to the same type"); 13263 unsigned FromEltSz = VT.getScalarSizeInBits(); 13264 unsigned ToEltSz = StVT.getScalarSizeInBits(); 13265 13266 // From, To sizes and ElemCount must be pow of two 13267 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 13268 return SDValue(); 13269 13270 // We are going to use the original vector elt for storing. 13271 // Accumulated smaller vector elements must be a multiple of the store size. 13272 if (0 != (NumElems * FromEltSz) % ToEltSz) 13273 return SDValue(); 13274 13275 unsigned SizeRatio = FromEltSz / ToEltSz; 13276 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 13277 13278 // Create a type on which we perform the shuffle. 
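  // For example, truncating a v4i32 store to v4i16 gives SizeRatio == 2, so the
  // shuffle is performed on v8i16.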
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                   NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDLoc DL(St);
  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
                                                      : i * SizeRatio;

  // Can't shuffle using an illegal type.
  if (!TLI.isTypeLegal(WideVecVT))
    return SDValue();

  SDValue Shuff = DAG.getVectorShuffle(
      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  // At this point all of the data is stored at the bottom of the
  // register. We now need to save it to mem.

  // Find the largest store unit
  MVT StoreType = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
      StoreType = Tp;
  }
  // Didn't find a legal store type.
  if (!TLI.isTypeLegal(StoreType))
    return SDValue();

  // Bitcast the original vector into a vector of store-size units
  EVT StoreVecVT =
      EVT::getVectorVT(*DAG.getContext(), StoreType,
                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  SmallVector<SDValue, 8> Chains;
  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue BasePtr = St->getBasePtr();

  // Perform one or more big stores into memory.
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from a truncate (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
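// For example, a store of (v8i16 (trunc v8i32)) becomes two v4i32 -> v4i16
// truncating stores, which MVE can select as narrowing stores (e.g. VSTRH.32).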
13338 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 13339 SelectionDAG &DAG) { 13340 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 13341 return SDValue(); 13342 SDValue Trunc = St->getValue(); 13343 if (Trunc->getOpcode() != ISD::TRUNCATE) 13344 return SDValue(); 13345 EVT FromVT = Trunc->getOperand(0).getValueType(); 13346 EVT ToVT = Trunc.getValueType(); 13347 if (!ToVT.isVector()) 13348 return SDValue(); 13349 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 13350 EVT ToEltVT = ToVT.getVectorElementType(); 13351 EVT FromEltVT = FromVT.getVectorElementType(); 13352 13353 unsigned NumElements = 0; 13354 if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) 13355 NumElements = 4; 13356 if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) 13357 NumElements = 8; 13358 if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements || 13359 FromVT.getVectorNumElements() % NumElements != 0) 13360 return SDValue(); 13361 13362 SDLoc DL(St); 13363 // Details about the old store 13364 SDValue Ch = St->getChain(); 13365 SDValue BasePtr = St->getBasePtr(); 13366 unsigned Alignment = St->getOriginalAlignment(); 13367 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 13368 AAMDNodes AAInfo = St->getAAInfo(); 13369 13370 EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements); 13371 EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements); 13372 13373 SmallVector<SDValue, 4> Stores; 13374 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 13375 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 13376 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 13377 13378 SDValue Extract = 13379 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 13380 DAG.getConstant(i * NumElements, DL, MVT::i32)); 13381 SDValue Store = DAG.getTruncStore( 13382 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 13383 NewToVT, Alignment, MMOFlags, AAInfo); 13384 Stores.push_back(Store); 13385 } 13386 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 13387 } 13388 13389 /// PerformSTORECombine - Target-specific dag combine xforms for 13390 /// ISD::STORE. 13391 static SDValue PerformSTORECombine(SDNode *N, 13392 TargetLowering::DAGCombinerInfo &DCI, 13393 const ARMSubtarget *Subtarget) { 13394 StoreSDNode *St = cast<StoreSDNode>(N); 13395 if (St->isVolatile()) 13396 return SDValue(); 13397 SDValue StVal = St->getValue(); 13398 EVT VT = StVal.getValueType(); 13399 13400 if (Subtarget->hasNEON()) 13401 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 13402 return Store; 13403 13404 if (Subtarget->hasMVEIntegerOps()) 13405 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 13406 return NewToken; 13407 13408 if (!ISD::isNormalStore(St)) 13409 return SDValue(); 13410 13411 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 13412 // ARM stores of arguments in the same cache line. 13413 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 13414 StVal.getNode()->hasOneUse()) { 13415 SelectionDAG &DAG = DCI.DAG; 13416 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 13417 SDLoc DL(St); 13418 SDValue BasePtr = St->getBasePtr(); 13419 SDValue NewST1 = DAG.getStore( 13420 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 
        1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32 d16, d17, d16
///  vcvt.s32.f32 d16, d16
/// becomes:
///  vcvt.s32.f32 d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32 d16, d16
///  vdiv.f32 d16, d17, d16
/// becomes:
///  vcvt.f32.s32 d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
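/// Currently this only handles the NEON vector shift intrinsics, lowering the
/// ones with immediate shift amounts to the corresponding ARMISD shift nodes.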
13582 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 13583 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 13584 switch (IntNo) { 13585 default: 13586 // Don't do anything for most intrinsics. 13587 break; 13588 13589 // Vector shifts: check for immediate versions and lower them. 13590 // Note: This is done during DAG combining instead of DAG legalizing because 13591 // the build_vectors for 64-bit vector element shift counts are generally 13592 // not legal, and it is hard to see their values after they get legalized to 13593 // loads from a constant pool. 13594 case Intrinsic::arm_neon_vshifts: 13595 case Intrinsic::arm_neon_vshiftu: 13596 case Intrinsic::arm_neon_vrshifts: 13597 case Intrinsic::arm_neon_vrshiftu: 13598 case Intrinsic::arm_neon_vrshiftn: 13599 case Intrinsic::arm_neon_vqshifts: 13600 case Intrinsic::arm_neon_vqshiftu: 13601 case Intrinsic::arm_neon_vqshiftsu: 13602 case Intrinsic::arm_neon_vqshiftns: 13603 case Intrinsic::arm_neon_vqshiftnu: 13604 case Intrinsic::arm_neon_vqshiftnsu: 13605 case Intrinsic::arm_neon_vqrshiftns: 13606 case Intrinsic::arm_neon_vqrshiftnu: 13607 case Intrinsic::arm_neon_vqrshiftnsu: { 13608 EVT VT = N->getOperand(1).getValueType(); 13609 int64_t Cnt; 13610 unsigned VShiftOpc = 0; 13611 13612 switch (IntNo) { 13613 case Intrinsic::arm_neon_vshifts: 13614 case Intrinsic::arm_neon_vshiftu: 13615 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 13616 VShiftOpc = ARMISD::VSHLIMM; 13617 break; 13618 } 13619 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 13620 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM 13621 : ARMISD::VSHRuIMM); 13622 break; 13623 } 13624 return SDValue(); 13625 13626 case Intrinsic::arm_neon_vrshifts: 13627 case Intrinsic::arm_neon_vrshiftu: 13628 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 13629 break; 13630 return SDValue(); 13631 13632 case Intrinsic::arm_neon_vqshifts: 13633 case Intrinsic::arm_neon_vqshiftu: 13634 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13635 break; 13636 return SDValue(); 13637 13638 case Intrinsic::arm_neon_vqshiftsu: 13639 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 13640 break; 13641 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 13642 13643 case Intrinsic::arm_neon_vrshiftn: 13644 case Intrinsic::arm_neon_vqshiftns: 13645 case Intrinsic::arm_neon_vqshiftnu: 13646 case Intrinsic::arm_neon_vqshiftnsu: 13647 case Intrinsic::arm_neon_vqrshiftns: 13648 case Intrinsic::arm_neon_vqrshiftnu: 13649 case Intrinsic::arm_neon_vqrshiftnsu: 13650 // Narrowing shifts require an immediate right shift. 13651 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 13652 break; 13653 llvm_unreachable("invalid shift count for narrowing vector shift " 13654 "intrinsic"); 13655 13656 default: 13657 llvm_unreachable("unhandled vector shift"); 13658 } 13659 13660 switch (IntNo) { 13661 case Intrinsic::arm_neon_vshifts: 13662 case Intrinsic::arm_neon_vshiftu: 13663 // Opcode already set above. 
13664 break; 13665 case Intrinsic::arm_neon_vrshifts: 13666 VShiftOpc = ARMISD::VRSHRsIMM; 13667 break; 13668 case Intrinsic::arm_neon_vrshiftu: 13669 VShiftOpc = ARMISD::VRSHRuIMM; 13670 break; 13671 case Intrinsic::arm_neon_vrshiftn: 13672 VShiftOpc = ARMISD::VRSHRNIMM; 13673 break; 13674 case Intrinsic::arm_neon_vqshifts: 13675 VShiftOpc = ARMISD::VQSHLsIMM; 13676 break; 13677 case Intrinsic::arm_neon_vqshiftu: 13678 VShiftOpc = ARMISD::VQSHLuIMM; 13679 break; 13680 case Intrinsic::arm_neon_vqshiftsu: 13681 VShiftOpc = ARMISD::VQSHLsuIMM; 13682 break; 13683 case Intrinsic::arm_neon_vqshiftns: 13684 VShiftOpc = ARMISD::VQSHRNsIMM; 13685 break; 13686 case Intrinsic::arm_neon_vqshiftnu: 13687 VShiftOpc = ARMISD::VQSHRNuIMM; 13688 break; 13689 case Intrinsic::arm_neon_vqshiftnsu: 13690 VShiftOpc = ARMISD::VQSHRNsuIMM; 13691 break; 13692 case Intrinsic::arm_neon_vqrshiftns: 13693 VShiftOpc = ARMISD::VQRSHRNsIMM; 13694 break; 13695 case Intrinsic::arm_neon_vqrshiftnu: 13696 VShiftOpc = ARMISD::VQRSHRNuIMM; 13697 break; 13698 case Intrinsic::arm_neon_vqrshiftnsu: 13699 VShiftOpc = ARMISD::VQRSHRNsuIMM; 13700 break; 13701 } 13702 13703 SDLoc dl(N); 13704 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13705 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 13706 } 13707 13708 case Intrinsic::arm_neon_vshiftins: { 13709 EVT VT = N->getOperand(1).getValueType(); 13710 int64_t Cnt; 13711 unsigned VShiftOpc = 0; 13712 13713 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 13714 VShiftOpc = ARMISD::VSLIIMM; 13715 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 13716 VShiftOpc = ARMISD::VSRIIMM; 13717 else { 13718 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 13719 } 13720 13721 SDLoc dl(N); 13722 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 13723 N->getOperand(1), N->getOperand(2), 13724 DAG.getConstant(Cnt, dl, MVT::i32)); 13725 } 13726 13727 case Intrinsic::arm_neon_vqrshifts: 13728 case Intrinsic::arm_neon_vqrshiftu: 13729 // No immediate versions of these to check for. 13730 break; 13731 } 13732 13733 return SDValue(); 13734 } 13735 13736 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 13737 /// lowers them. As with the vector shift intrinsics, this is done during DAG 13738 /// combining instead of DAG legalizing because the build_vectors for 64-bit 13739 /// vector element shift counts are generally not legal, and it is hard to see 13740 /// their values after they get legalized to loads from a constant pool. 13741 static SDValue PerformShiftCombine(SDNode *N, 13742 TargetLowering::DAGCombinerInfo &DCI, 13743 const ARMSubtarget *ST) { 13744 SelectionDAG &DAG = DCI.DAG; 13745 EVT VT = N->getValueType(0); 13746 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 13747 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 13748 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 
13749 SDValue N1 = N->getOperand(1); 13750 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 13751 SDValue N0 = N->getOperand(0); 13752 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 13753 DAG.MaskedValueIsZero(N0.getOperand(0), 13754 APInt::getHighBitsSet(32, 16))) 13755 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 13756 } 13757 } 13758 13759 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 13760 N->getOperand(0)->getOpcode() == ISD::AND && 13761 N->getOperand(0)->hasOneUse()) { 13762 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13763 return SDValue(); 13764 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 13765 // usually show up because instcombine prefers to canonicalize it to 13766 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 13767 // out of GEP lowering in some cases. 13768 SDValue N0 = N->getOperand(0); 13769 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13770 if (!ShiftAmtNode) 13771 return SDValue(); 13772 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 13773 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13774 if (!AndMaskNode) 13775 return SDValue(); 13776 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 13777 // Don't transform uxtb/uxth. 13778 if (AndMask == 255 || AndMask == 65535) 13779 return SDValue(); 13780 if (isMask_32(AndMask)) { 13781 uint32_t MaskedBits = countLeadingZeros(AndMask); 13782 if (MaskedBits > ShiftAmt) { 13783 SDLoc DL(N); 13784 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13785 DAG.getConstant(MaskedBits, DL, MVT::i32)); 13786 return DAG.getNode( 13787 ISD::SRL, DL, MVT::i32, SHL, 13788 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 13789 } 13790 } 13791 } 13792 13793 // Nothing to be done for scalar shifts. 13794 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13795 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 13796 return SDValue(); 13797 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 13798 return SDValue(); 13799 13800 int64_t Cnt; 13801 13802 switch (N->getOpcode()) { 13803 default: llvm_unreachable("unexpected shift opcode"); 13804 13805 case ISD::SHL: 13806 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 13807 SDLoc dl(N); 13808 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 13809 DAG.getConstant(Cnt, dl, MVT::i32)); 13810 } 13811 break; 13812 13813 case ISD::SRA: 13814 case ISD::SRL: 13815 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 13816 unsigned VShiftOpc = 13817 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 13818 SDLoc dl(N); 13819 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 13820 DAG.getConstant(Cnt, dl, MVT::i32)); 13821 } 13822 } 13823 return SDValue(); 13824 } 13825 13826 // Look for a sign/zero extend of a larger than legal load. This can be split 13827 // into two extending loads, which are simpler to deal with than an arbitrary 13828 // sign extend. 
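// For example, (v8i32 (sext (v8i16 load))) becomes two v4i16 -> v4i32 extending
// loads whose results are concatenated back together.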
13829 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 13830 SDValue N0 = N->getOperand(0); 13831 if (N0.getOpcode() != ISD::LOAD) 13832 return SDValue(); 13833 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 13834 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 13835 LD->getExtensionType() != ISD::NON_EXTLOAD) 13836 return SDValue(); 13837 EVT FromVT = LD->getValueType(0); 13838 EVT ToVT = N->getValueType(0); 13839 if (!ToVT.isVector()) 13840 return SDValue(); 13841 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 13842 EVT ToEltVT = ToVT.getVectorElementType(); 13843 EVT FromEltVT = FromVT.getVectorElementType(); 13844 13845 unsigned NumElements = 0; 13846 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 13847 NumElements = 4; 13848 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 13849 NumElements = 8; 13850 if (NumElements == 0 || 13851 FromVT.getVectorNumElements() == NumElements || 13852 FromVT.getVectorNumElements() % NumElements != 0 || 13853 !isPowerOf2_32(NumElements)) 13854 return SDValue(); 13855 13856 SDLoc DL(LD); 13857 // Details about the old load 13858 SDValue Ch = LD->getChain(); 13859 SDValue BasePtr = LD->getBasePtr(); 13860 unsigned Alignment = LD->getOriginalAlignment(); 13861 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); 13862 AAMDNodes AAInfo = LD->getAAInfo(); 13863 13864 ISD::LoadExtType NewExtType = 13865 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 13866 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 13867 EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext()); 13868 EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext()); 13869 unsigned NewOffset = NewFromVT.getSizeInBits() / 8; 13870 SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset); 13871 13872 // Split the load in half, each side of which is extended separately. This 13873 // is good enough, as legalisation will take it from there. They are either 13874 // already legal or they will be split further into something that is 13875 // legal. 13876 SDValue NewLoad1 = 13877 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset, 13878 LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo); 13879 SDValue NewLoad2 = 13880 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 13881 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 13882 Alignment, MMOFlags, AAInfo); 13883 13884 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 13885 SDValue(NewLoad1.getNode(), 1), 13886 SDValue(NewLoad2.getNode(), 1)); 13887 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 13888 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2); 13889 } 13890 13891 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 13892 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 13893 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 13894 const ARMSubtarget *ST) { 13895 SDValue N0 = N->getOperand(0); 13896 13897 // Check for sign- and zero-extensions of vector extract operations of 8- 13898 // and 16-bit vector elements. NEON supports these directly. They are 13899 // handled during DAG combining because type legalization will promote them 13900 // to 32-bit types and it is messy to recognize the operations after that. 
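  // For example, (i32 (sext (extract_vector_elt v8i16 V, lane))) is combined
  // into (VGETLANEs V, lane) here rather than after legalization.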
13901 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 13902 SDValue Vec = N0.getOperand(0); 13903 SDValue Lane = N0.getOperand(1); 13904 EVT VT = N->getValueType(0); 13905 EVT EltVT = N0.getValueType(); 13906 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13907 13908 if (VT == MVT::i32 && 13909 (EltVT == MVT::i8 || EltVT == MVT::i16) && 13910 TLI.isTypeLegal(Vec.getValueType()) && 13911 isa<ConstantSDNode>(Lane)) { 13912 13913 unsigned Opc = 0; 13914 switch (N->getOpcode()) { 13915 default: llvm_unreachable("unexpected opcode"); 13916 case ISD::SIGN_EXTEND: 13917 Opc = ARMISD::VGETLANEs; 13918 break; 13919 case ISD::ZERO_EXTEND: 13920 case ISD::ANY_EXTEND: 13921 Opc = ARMISD::VGETLANEu; 13922 break; 13923 } 13924 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 13925 } 13926 } 13927 13928 if (ST->hasMVEIntegerOps()) 13929 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 13930 return NewLoad; 13931 13932 return SDValue(); 13933 } 13934 13935 static const APInt *isPowerOf2Constant(SDValue V) { 13936 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 13937 if (!C) 13938 return nullptr; 13939 const APInt *CV = &C->getAPIntValue(); 13940 return CV->isPowerOf2() ? CV : nullptr; 13941 } 13942 13943 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 13944 // If we have a CMOV, OR and AND combination such as: 13945 // if (x & CN) 13946 // y |= CM; 13947 // 13948 // And: 13949 // * CN is a single bit; 13950 // * All bits covered by CM are known zero in y 13951 // 13952 // Then we can convert this into a sequence of BFI instructions. This will 13953 // always be a win if CM is a single bit, will always be no worse than the 13954 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 13955 // three bits (due to the extra IT instruction). 13956 13957 SDValue Op0 = CMOV->getOperand(0); 13958 SDValue Op1 = CMOV->getOperand(1); 13959 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 13960 auto CC = CCNode->getAPIntValue().getLimitedValue(); 13961 SDValue CmpZ = CMOV->getOperand(4); 13962 13963 // The compare must be against zero. 13964 if (!isNullConstant(CmpZ->getOperand(1))) 13965 return SDValue(); 13966 13967 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 13968 SDValue And = CmpZ->getOperand(0); 13969 if (And->getOpcode() != ISD::AND) 13970 return SDValue(); 13971 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 13972 if (!AndC) 13973 return SDValue(); 13974 SDValue X = And->getOperand(0); 13975 13976 if (CC == ARMCC::EQ) { 13977 // We're performing an "equal to zero" compare. Swap the operands so we 13978 // canonicalize on a "not equal to zero" compare. 13979 std::swap(Op0, Op1); 13980 } else { 13981 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 13982 } 13983 13984 if (Op1->getOpcode() != ISD::OR) 13985 return SDValue(); 13986 13987 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 13988 if (!OrC) 13989 return SDValue(); 13990 SDValue Y = Op1->getOperand(0); 13991 13992 if (Op0 != Y) 13993 return SDValue(); 13994 13995 // Now, is it profitable to continue? 13996 APInt OrCI = OrC->getAPIntValue(); 13997 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 13998 if (OrCI.countPopulation() > Heuristic) 13999 return SDValue(); 14000 14001 // Lastly, can we determine that the bits defined by OrCI 14002 // are zero in Y? 
14003 KnownBits Known = DAG.computeKnownBits(Y); 14004 if ((OrCI & Known.Zero) != OrCI) 14005 return SDValue(); 14006 14007 // OK, we can do the combine. 14008 SDValue V = Y; 14009 SDLoc dl(X); 14010 EVT VT = X.getValueType(); 14011 unsigned BitInX = AndC->logBase2(); 14012 14013 if (BitInX != 0) { 14014 // We must shift X first. 14015 X = DAG.getNode(ISD::SRL, dl, VT, X, 14016 DAG.getConstant(BitInX, dl, VT)); 14017 } 14018 14019 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 14020 BitInY < NumActiveBits; ++BitInY) { 14021 if (OrCI[BitInY] == 0) 14022 continue; 14023 APInt Mask(VT.getSizeInBits(), 0); 14024 Mask.setBit(BitInY); 14025 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 14026 // Confusingly, the operand is an *inverted* mask. 14027 DAG.getConstant(~Mask, dl, VT)); 14028 } 14029 14030 return V; 14031 } 14032 14033 // Given N, the value controlling the conditional branch, search for the loop 14034 // intrinsic, returning it, along with how the value is used. We need to handle 14035 // patterns such as the following: 14036 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) 14037 // (brcond (setcc (loop.decrement), 0, eq), exit) 14038 // (brcond (setcc (loop.decrement), 0, ne), header) 14039 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, 14040 bool &Negate) { 14041 switch (N->getOpcode()) { 14042 default: 14043 break; 14044 case ISD::XOR: { 14045 if (!isa<ConstantSDNode>(N.getOperand(1))) 14046 return SDValue(); 14047 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) 14048 return SDValue(); 14049 Negate = !Negate; 14050 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); 14051 } 14052 case ISD::SETCC: { 14053 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); 14054 if (!Const) 14055 return SDValue(); 14056 if (Const->isNullValue()) 14057 Imm = 0; 14058 else if (Const->isOne()) 14059 Imm = 1; 14060 else 14061 return SDValue(); 14062 CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); 14063 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); 14064 } 14065 case ISD::INTRINSIC_W_CHAIN: { 14066 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); 14067 if (IntOp != Intrinsic::test_set_loop_iterations && 14068 IntOp != Intrinsic::loop_decrement_reg) 14069 return SDValue(); 14070 return N; 14071 } 14072 } 14073 return SDValue(); 14074 } 14075 14076 static SDValue PerformHWLoopCombine(SDNode *N, 14077 TargetLowering::DAGCombinerInfo &DCI, 14078 const ARMSubtarget *ST) { 14079 14080 // The hwloop intrinsics that we're interested are used for control-flow, 14081 // either for entering or exiting the loop: 14082 // - test.set.loop.iterations will test whether its operand is zero. If it 14083 // is zero, the proceeding branch should not enter the loop. 14084 // - loop.decrement.reg also tests whether its operand is zero. If it is 14085 // zero, the proceeding branch should not branch back to the beginning of 14086 // the loop. 14087 // So here, we need to check that how the brcond is using the result of each 14088 // of the intrinsics to ensure that we're branching to the right place at the 14089 // right time. 
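  // When the pattern matches, the branches are rewritten in terms of
  // ARMISD::WLS (while-loop start) for test.set.loop.iterations, and
  // ARMISD::LOOP_DEC plus ARMISD::LE (loop end) for loop.decrement.reg.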
14090 14091 ISD::CondCode CC; 14092 SDValue Cond; 14093 int Imm = 1; 14094 bool Negate = false; 14095 SDValue Chain = N->getOperand(0); 14096 SDValue Dest; 14097 14098 if (N->getOpcode() == ISD::BRCOND) { 14099 CC = ISD::SETEQ; 14100 Cond = N->getOperand(1); 14101 Dest = N->getOperand(2); 14102 } else { 14103 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!"); 14104 CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 14105 Cond = N->getOperand(2); 14106 Dest = N->getOperand(4); 14107 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) { 14108 if (!Const->isOne() && !Const->isNullValue()) 14109 return SDValue(); 14110 Imm = Const->getZExtValue(); 14111 } else 14112 return SDValue(); 14113 } 14114 14115 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate); 14116 if (!Int) 14117 return SDValue(); 14118 14119 if (Negate) 14120 CC = ISD::getSetCCInverse(CC, true); 14121 14122 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) { 14123 return (CC == ISD::SETEQ && Imm == 0) || 14124 (CC == ISD::SETNE && Imm == 1) || 14125 (CC == ISD::SETLT && Imm == 1) || 14126 (CC == ISD::SETULT && Imm == 1); 14127 }; 14128 14129 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) { 14130 return (CC == ISD::SETEQ && Imm == 1) || 14131 (CC == ISD::SETNE && Imm == 0) || 14132 (CC == ISD::SETGT && Imm == 0) || 14133 (CC == ISD::SETUGT && Imm == 0) || 14134 (CC == ISD::SETGE && Imm == 1) || 14135 (CC == ISD::SETUGE && Imm == 1); 14136 }; 14137 14138 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) && 14139 "unsupported condition"); 14140 14141 SDLoc dl(Int); 14142 SelectionDAG &DAG = DCI.DAG; 14143 SDValue Elements = Int.getOperand(2); 14144 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue(); 14145 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) 14146 && "expected single br user"); 14147 SDNode *Br = *N->use_begin(); 14148 SDValue OtherTarget = Br->getOperand(1); 14149 14150 // Update the unconditional branch to branch to the given Dest. 14151 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) { 14152 SDValue NewBrOps[] = { Br->getOperand(0), Dest }; 14153 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps); 14154 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr); 14155 }; 14156 14157 if (IntOp == Intrinsic::test_set_loop_iterations) { 14158 SDValue Res; 14159 // We expect this 'instruction' to branch when the counter is zero. 14160 if (IsTrueIfZero(CC, Imm)) { 14161 SDValue Ops[] = { Chain, Elements, Dest }; 14162 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14163 } else { 14164 // The logic is the reverse of what we need for WLS, so find the other 14165 // basic block target: the target of the proceeding br. 14166 UpdateUncondBr(Br, Dest, DAG); 14167 14168 SDValue Ops[] = { Chain, Elements, OtherTarget }; 14169 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); 14170 } 14171 DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); 14172 return Res; 14173 } else { 14174 SDValue Size = DAG.getTargetConstant( 14175 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32); 14176 SDValue Args[] = { Int.getOperand(0), Elements, Size, }; 14177 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl, 14178 DAG.getVTList(MVT::i32, MVT::Other), Args); 14179 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode()); 14180 14181 // We expect this instruction to branch when the count is not zero. 14182 SDValue Target = IsFalseIfZero(CC, Imm) ? 
Dest : OtherTarget; 14183 14184 // Update the unconditional branch to target the loop preheader if we've 14185 // found the condition has been reversed. 14186 if (Target == OtherTarget) 14187 UpdateUncondBr(Br, Dest, DAG); 14188 14189 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 14190 SDValue(LoopDec.getNode(), 1), Chain); 14191 14192 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target }; 14193 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs); 14194 } 14195 return SDValue(); 14196 } 14197 14198 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 14199 SDValue 14200 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 14201 SDValue Cmp = N->getOperand(4); 14202 if (Cmp.getOpcode() != ARMISD::CMPZ) 14203 // Only looking at NE cases. 14204 return SDValue(); 14205 14206 EVT VT = N->getValueType(0); 14207 SDLoc dl(N); 14208 SDValue LHS = Cmp.getOperand(0); 14209 SDValue RHS = Cmp.getOperand(1); 14210 SDValue Chain = N->getOperand(0); 14211 SDValue BB = N->getOperand(1); 14212 SDValue ARMcc = N->getOperand(2); 14213 ARMCC::CondCodes CC = 14214 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14215 14216 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 14217 // -> (brcond Chain BB CC CPSR Cmp) 14218 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 14219 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 14220 LHS->getOperand(0)->hasOneUse()) { 14221 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 14222 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 14223 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14224 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14225 if ((LHS00C && LHS00C->getZExtValue() == 0) && 14226 (LHS01C && LHS01C->getZExtValue() == 1) && 14227 (LHS1C && LHS1C->getZExtValue() == 1) && 14228 (RHSC && RHSC->getZExtValue() == 0)) { 14229 return DAG.getNode( 14230 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 14231 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 14232 } 14233 } 14234 14235 return SDValue(); 14236 } 14237 14238 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 14239 SDValue 14240 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 14241 SDValue Cmp = N->getOperand(4); 14242 if (Cmp.getOpcode() != ARMISD::CMPZ) 14243 // Only looking at EQ and NE cases. 14244 return SDValue(); 14245 14246 EVT VT = N->getValueType(0); 14247 SDLoc dl(N); 14248 SDValue LHS = Cmp.getOperand(0); 14249 SDValue RHS = Cmp.getOperand(1); 14250 SDValue FalseVal = N->getOperand(0); 14251 SDValue TrueVal = N->getOperand(1); 14252 SDValue ARMcc = N->getOperand(2); 14253 ARMCC::CondCodes CC = 14254 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14255 14256 // BFI is only available on V6T2+. 14257 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 14258 SDValue R = PerformCMOVToBFICombine(N, DAG); 14259 if (R) 14260 return R; 14261 } 14262 14263 // Simplify 14264 // mov r1, r0 14265 // cmp r1, x 14266 // mov r0, y 14267 // moveq r0, x 14268 // to 14269 // cmp r0, x 14270 // movne r0, y 14271 // 14272 // mov r1, r0 14273 // cmp r1, x 14274 // mov r0, x 14275 // movne r0, y 14276 // to 14277 // cmp r0, x 14278 // movne r0, y 14279 /// FIXME: Turn this into a target neutral optimization? 
14280 SDValue Res; 14281 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 14282 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 14283 N->getOperand(3), Cmp); 14284 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 14285 SDValue ARMcc; 14286 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 14287 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 14288 N->getOperand(3), NewCmp); 14289 } 14290 14291 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 14292 // -> (cmov F T CC CPSR Cmp) 14293 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 14294 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 14295 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14296 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14297 if ((LHS0C && LHS0C->getZExtValue() == 0) && 14298 (LHS1C && LHS1C->getZExtValue() == 1) && 14299 (RHSC && RHSC->getZExtValue() == 0)) { 14300 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 14301 LHS->getOperand(2), LHS->getOperand(3), 14302 LHS->getOperand(4)); 14303 } 14304 } 14305 14306 if (!VT.isInteger()) 14307 return SDValue(); 14308 14309 // Materialize a boolean comparison for integers so we can avoid branching. 14310 if (isNullConstant(FalseVal)) { 14311 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 14312 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 14313 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 14314 // right 5 bits will make that 32 be 1, otherwise it will be 0. 14315 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 14316 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14317 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 14318 DAG.getConstant(5, dl, MVT::i32)); 14319 } else { 14320 // CMOV 0, 1, ==, (CMPZ x, y) -> 14321 // (ADDCARRY (SUB x, y), t:0, t:1) 14322 // where t = (SUBCARRY 0, (SUB x, y), 0) 14323 // 14324 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 14325 // x != y. In other words, a carry C == 1 when x == y, C == 0 14326 // otherwise. 14327 // The final ADDCARRY computes 14328 // x - y + (0 - (x - y)) + C == C 14329 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 14330 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14331 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 14332 // ISD::SUBCARRY returns a borrow but we want the carry here 14333 // actually. 14334 SDValue Carry = 14335 DAG.getNode(ISD::SUB, dl, MVT::i32, 14336 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 14337 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 14338 } 14339 } else if (CC == ARMCC::NE && !isNullConstant(RHS) && 14340 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 14341 // This seems pointless but will allow us to combine it further below. 
14342 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14343 SDValue Sub = 14344 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14345 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14346 Sub.getValue(1), SDValue()); 14347 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 14348 N->getOperand(3), CPSRGlue.getValue(1)); 14349 FalseVal = Sub; 14350 } 14351 } else if (isNullConstant(TrueVal)) { 14352 if (CC == ARMCC::EQ && !isNullConstant(RHS) && 14353 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 14354 // This seems pointless but will allow us to combine it further below 14355 // Note that we change == for != as this is the dual for the case above. 14356 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1 14357 SDValue Sub = 14358 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS); 14359 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 14360 Sub.getValue(1), SDValue()); 14361 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 14362 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 14363 N->getOperand(3), CPSRGlue.getValue(1)); 14364 FalseVal = Sub; 14365 } 14366 } 14367 14368 // On Thumb1, the DAG above may be further combined if z is a power of 2 14369 // (z == 2 ^ K). 14370 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> 14371 // t1 = (USUBO (SUB x, y), 1) 14372 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 14373 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14374 // 14375 // This also handles the special case of comparing against zero; it's 14376 // essentially, the same pattern, except there's no SUBS: 14377 // CMOV x, z, !=, (CMPZ x, 0) -> 14378 // t1 = (USUBO x, 1) 14379 // t2 = (SUBCARRY x, t1:0, t1:1) 14380 // Result = if K != 0 then (SHL t2:0, K) else t2:0 14381 const APInt *TrueConst; 14382 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 14383 ((FalseVal.getOpcode() == ARMISD::SUBS && 14384 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || 14385 (FalseVal == LHS && isNullConstant(RHS))) && 14386 (TrueConst = isPowerOf2Constant(TrueVal))) { 14387 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 14388 unsigned ShiftAmount = TrueConst->logBase2(); 14389 if (ShiftAmount) 14390 TrueVal = DAG.getConstant(1, dl, VT); 14391 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 14392 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 14393 14394 if (ShiftAmount) 14395 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 14396 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 14397 } 14398 14399 if (Res.getNode()) { 14400 KnownBits Known = DAG.computeKnownBits(SDValue(N,0)); 14401 // Capture demanded bits information that would be otherwise lost. 
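    // If the result is known to be an i1, i8 or i16 value, wrap it in an
    // AssertZext so later combines can keep relying on the zeroed upper bits.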
14402 if (Known.Zero == 0xfffffffe) 14403 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14404 DAG.getValueType(MVT::i1)); 14405 else if (Known.Zero == 0xffffff00) 14406 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14407 DAG.getValueType(MVT::i8)); 14408 else if (Known.Zero == 0xffff0000) 14409 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 14410 DAG.getValueType(MVT::i16)); 14411 } 14412 14413 return Res; 14414 } 14415 14416 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 14417 DAGCombinerInfo &DCI) const { 14418 switch (N->getOpcode()) { 14419 default: break; 14420 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 14421 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 14422 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 14423 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 14424 case ISD::SUB: return PerformSUBCombine(N, DCI); 14425 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 14426 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 14427 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 14428 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 14429 case ISD::BRCOND: 14430 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 14431 case ARMISD::ADDC: 14432 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 14433 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 14434 case ARMISD::BFI: return PerformBFICombine(N, DCI); 14435 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 14436 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 14437 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 14438 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 14439 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 14440 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 14441 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 14442 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 14443 case ISD::FP_TO_SINT: 14444 case ISD::FP_TO_UINT: 14445 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 14446 case ISD::FDIV: 14447 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 14448 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 14449 case ISD::SHL: 14450 case ISD::SRA: 14451 case ISD::SRL: 14452 return PerformShiftCombine(N, DCI, Subtarget); 14453 case ISD::SIGN_EXTEND: 14454 case ISD::ZERO_EXTEND: 14455 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 14456 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 14457 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 14458 case ISD::LOAD: return PerformLOADCombine(N, DCI); 14459 case ARMISD::VLD1DUP: 14460 case ARMISD::VLD2DUP: 14461 case ARMISD::VLD3DUP: 14462 case ARMISD::VLD4DUP: 14463 return PerformVLDCombine(N, DCI); 14464 case ARMISD::BUILD_VECTOR: 14465 return PerformARMBUILD_VECTORCombine(N, DCI); 14466 case ARMISD::PREDICATE_CAST: 14467 return PerformPREDICATE_CASTCombine(N, DCI); 14468 case ARMISD::SMULWB: { 14469 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14470 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14471 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14472 return SDValue(); 14473 break; 14474 } 14475 case ARMISD::SMULWT: { 14476 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14477 APInt DemandedMask = 
APInt::getHighBitsSet(BitWidth, 16); 14478 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14479 return SDValue(); 14480 break; 14481 } 14482 case ARMISD::SMLALBB: 14483 case ARMISD::QADD16b: 14484 case ARMISD::QSUB16b: { 14485 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14486 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 14487 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14488 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14489 return SDValue(); 14490 break; 14491 } 14492 case ARMISD::SMLALBT: { 14493 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 14494 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14495 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 14496 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14497 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 14498 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 14499 return SDValue(); 14500 break; 14501 } 14502 case ARMISD::SMLALTB: { 14503 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 14504 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 14505 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 14506 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 14507 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 14508 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 14509 return SDValue(); 14510 break; 14511 } 14512 case ARMISD::SMLALTT: { 14513 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14514 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 14515 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14516 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14517 return SDValue(); 14518 break; 14519 } 14520 case ARMISD::QADD8b: 14521 case ARMISD::QSUB8b: { 14522 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 14523 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 14524 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 14525 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 14526 return SDValue(); 14527 break; 14528 } 14529 case ISD::INTRINSIC_VOID: 14530 case ISD::INTRINSIC_W_CHAIN: 14531 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 14532 case Intrinsic::arm_neon_vld1: 14533 case Intrinsic::arm_neon_vld1x2: 14534 case Intrinsic::arm_neon_vld1x3: 14535 case Intrinsic::arm_neon_vld1x4: 14536 case Intrinsic::arm_neon_vld2: 14537 case Intrinsic::arm_neon_vld3: 14538 case Intrinsic::arm_neon_vld4: 14539 case Intrinsic::arm_neon_vld2lane: 14540 case Intrinsic::arm_neon_vld3lane: 14541 case Intrinsic::arm_neon_vld4lane: 14542 case Intrinsic::arm_neon_vld2dup: 14543 case Intrinsic::arm_neon_vld3dup: 14544 case Intrinsic::arm_neon_vld4dup: 14545 case Intrinsic::arm_neon_vst1: 14546 case Intrinsic::arm_neon_vst1x2: 14547 case Intrinsic::arm_neon_vst1x3: 14548 case Intrinsic::arm_neon_vst1x4: 14549 case Intrinsic::arm_neon_vst2: 14550 case Intrinsic::arm_neon_vst3: 14551 case Intrinsic::arm_neon_vst4: 14552 case Intrinsic::arm_neon_vst2lane: 14553 case Intrinsic::arm_neon_vst3lane: 14554 case Intrinsic::arm_neon_vst4lane: 14555 return PerformVLDCombine(N, DCI); 14556 default: break; 14557 } 14558 break; 14559 } 14560 return SDValue(); 14561 } 14562 14563 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 14564 EVT VT) const { 14565 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 14566 } 
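
// Return true if a misaligned memory access of the given type is allowed on
// this subtarget, setting *Fast when such an access is also expected to be
// fast.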
14567 14568 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 14569 unsigned Alignment, 14570 MachineMemOperand::Flags, 14571 bool *Fast) const { 14572 // Depends on what it gets converted into if the type is weird. 14573 if (!VT.isSimple()) 14574 return false; 14575 14576 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs 14577 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 14578 auto Ty = VT.getSimpleVT().SimpleTy; 14579 14580 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 14581 // Unaligned accesses can use (for example) LDRB, LDRH, LDR 14582 if (AllowsUnaligned) { 14583 if (Fast) 14584 *Fast = Subtarget->hasV7Ops(); 14585 return true; 14586 } 14587 } 14588 14589 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 14590 // For any little-endian target with NEON, we can support unaligned ld/st 14591 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 14592 // A big-endian target may also explicitly support unaligned accesses 14593 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 14594 if (Fast) 14595 *Fast = true; 14596 return true; 14597 } 14598 } 14599 14600 if (!Subtarget->hasMVEIntegerOps()) 14601 return false; 14602 14603 // These are for predicates 14604 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { 14605 if (Fast) 14606 *Fast = true; 14607 return true; 14608 } 14609 14610 // These are for truncated stores/narrowing loads. They are fine so long as 14611 // the alignment is at least the size of the item being loaded 14612 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 14613 Alignment >= VT.getScalarSizeInBits() / 8) { 14614 if (Fast) 14615 *Fast = true; 14616 return true; 14617 } 14618 14619 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 14620 // VSTRW.U32 all store the vector register in exactly the same format, and 14621 // differ only in the range of their immediate offset field and the required 14622 // alignment. So there is always a store that can be used, regardless of 14623 // actual type. 14624 // 14625 // For big endian, that is not the case. But we can still emit a (VSTRB.U8; 14626 // VREV64.8) pair and get the same effect. This will likely be better than 14627 // aligning the vector through the stack. 14628 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 14629 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 14630 Ty == MVT::v2f64) { 14631 if (Fast) 14632 *Fast = true; 14633 return true; 14634 } 14635 14636 return false; 14637 } 14638 14639 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, 14640 unsigned AlignCheck) { 14641 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && 14642 (DstAlign == 0 || DstAlign % AlignCheck == 0)); 14643 } 14644 14645 EVT ARMTargetLowering::getOptimalMemOpType( 14646 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, 14647 bool ZeroMemset, bool MemcpyStrSrc, 14648 const AttributeList &FuncAttributes) const { 14649 // See if we can use NEON instructions for this...
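// As an illustration of the checks below: a memcpy of at least 16 bytes
// whose source and destination are both 16-byte aligned (or where the
// subtarget reports fast misaligned v2f64 accesses) is given MVT::v2f64 and
// expands into 128-bit NEON loads/stores; 8-byte chunks fall back to f64,
// and everything else returns MVT::Other so the generic lowering decides.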
14650 if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && 14651 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { 14652 bool Fast; 14653 if (Size >= 16 && 14654 (memOpAlign(SrcAlign, DstAlign, 16) || 14655 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, 14656 MachineMemOperand::MONone, &Fast) && 14657 Fast))) { 14658 return MVT::v2f64; 14659 } else if (Size >= 8 && 14660 (memOpAlign(SrcAlign, DstAlign, 8) || 14661 (allowsMisalignedMemoryAccesses( 14662 MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && 14663 Fast))) { 14664 return MVT::f64; 14665 } 14666 } 14667 14668 // Let the target-independent logic figure it out. 14669 return MVT::Other; 14670 } 14671 14672 // 64-bit integers are split into their high and low parts and held in two 14673 // different registers, so the trunc is free since the low register can just 14674 // be used. 14675 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 14676 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 14677 return false; 14678 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 14679 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 14680 return (SrcBits == 64 && DestBits == 32); 14681 } 14682 14683 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 14684 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 14685 !DstVT.isInteger()) 14686 return false; 14687 unsigned SrcBits = SrcVT.getSizeInBits(); 14688 unsigned DestBits = DstVT.getSizeInBits(); 14689 return (SrcBits == 64 && DestBits == 32); 14690 } 14691 14692 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 14693 if (Val.getOpcode() != ISD::LOAD) 14694 return false; 14695 14696 EVT VT1 = Val.getValueType(); 14697 if (!VT1.isSimple() || !VT1.isInteger() || 14698 !VT2.isSimple() || !VT2.isInteger()) 14699 return false; 14700 14701 switch (VT1.getSimpleVT().SimpleTy) { 14702 default: break; 14703 case MVT::i1: 14704 case MVT::i8: 14705 case MVT::i16: 14706 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 14707 return true; 14708 } 14709 14710 return false; 14711 } 14712 14713 bool ARMTargetLowering::isFNegFree(EVT VT) const { 14714 if (!VT.isSimple()) 14715 return false; 14716 14717 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 14718 // negate values directly (fneg is free). So, we don't want to let the DAG 14719 // combiner rewrite fneg into xors and some other instructions. For f16 and 14720 // FullFP16 argument passing, some bitcast nodes may be introduced, 14721 // triggering this DAG combine rewrite, so we are avoiding that with this. 14722 switch (VT.getSimpleVT().SimpleTy) { 14723 default: break; 14724 case MVT::f16: 14725 return Subtarget->hasFullFP16(); 14726 } 14727 14728 return false; 14729 } 14730 14731 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 14732 /// of the vector elements. 
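/// For example, a sext or zext from <8 x i8> to <8 x i16> counts as
/// "doubled" here (16 == 2 * 8), whereas an extend straight from <8 x i8>
/// to <8 x i32> does not.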
14733 static bool areExtractExts(Value *Ext1, Value *Ext2) { 14734 auto areExtDoubled = [](Instruction *Ext) { 14735 return Ext->getType()->getScalarSizeInBits() == 14736 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 14737 }; 14738 14739 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 14740 !match(Ext2, m_ZExtOrSExt(m_Value())) || 14741 !areExtDoubled(cast<Instruction>(Ext1)) || 14742 !areExtDoubled(cast<Instruction>(Ext2))) 14743 return false; 14744 14745 return true; 14746 } 14747 14748 /// Check if sinking \p I's operands to I's basic block is profitable, because 14749 /// the operands can be folded into a target instruction, e.g. 14750 /// sext/zext can be folded into vsubl. 14751 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 14752 SmallVectorImpl<Use *> &Ops) const { 14753 if (!I->getType()->isVectorTy()) 14754 return false; 14755 14756 if (Subtarget->hasNEON()) { 14757 switch (I->getOpcode()) { 14758 case Instruction::Sub: 14759 case Instruction::Add: { 14760 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 14761 return false; 14762 Ops.push_back(&I->getOperandUse(0)); 14763 Ops.push_back(&I->getOperandUse(1)); 14764 return true; 14765 } 14766 default: 14767 return false; 14768 } 14769 } 14770 14771 if (!Subtarget->hasMVEIntegerOps()) 14772 return false; 14773 14774 auto IsSinker = [](Instruction *I, int Operand) { 14775 switch (I->getOpcode()) { 14776 case Instruction::Add: 14777 case Instruction::Mul: 14778 return true; 14779 case Instruction::Sub: 14780 return Operand == 1; 14781 default: 14782 return false; 14783 } 14784 }; 14785 14786 int Op = 0; 14787 if (!isa<ShuffleVectorInst>(I->getOperand(Op))) 14788 Op = 1; 14789 if (!IsSinker(I, Op)) 14790 return false; 14791 if (!match(I->getOperand(Op), 14792 m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), 14793 m_Undef(), m_Zero()))) { 14794 return false; 14795 } 14796 Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); 14797 // All uses of the shuffle should be sunk to avoid duplicating it across gpr 14798 // and vector registers 14799 for (Use &U : Shuffle->uses()) { 14800 Instruction *Insn = cast<Instruction>(U.getUser()); 14801 if (!IsSinker(Insn, U.getOperandNo())) 14802 return false; 14803 } 14804 Ops.push_back(&Shuffle->getOperandUse(0)); 14805 Ops.push_back(&I->getOperandUse(Op)); 14806 return true; 14807 } 14808 14809 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 14810 EVT VT = ExtVal.getValueType(); 14811 14812 if (!isTypeLegal(VT)) 14813 return false; 14814 14815 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) { 14816 if (Ld->isExpandingLoad()) 14817 return false; 14818 } 14819 14820 // Don't create a loadext if we can fold the extension into a wide/long 14821 // instruction. 14822 // If there's more than one user instruction, the loadext is desirable no 14823 // matter what. There can be two uses by the same instruction. 
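// For example, if the only user of (sext (load ...)) is an ADD, the
// extension can be folded into a widening instruction such as VADDL at
// instruction selection, so merging it into an extending load would hide
// that opportunity. The check below therefore rejects the loadext exactly
// when there is a single user and it is one of the foldable opcodes.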
14824 if (ExtVal->use_empty() || 14825 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 14826 return true; 14827 14828 SDNode *U = *ExtVal->use_begin(); 14829 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 14830 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) 14831 return false; 14832 14833 return true; 14834 } 14835 14836 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 14837 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 14838 return false; 14839 14840 if (!isTypeLegal(EVT::getEVT(Ty1))) 14841 return false; 14842 14843 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 14844 14845 // Assuming the caller doesn't have a zeroext or signext return parameter, 14846 // truncation all the way down to i1 is valid. 14847 return true; 14848 } 14849 14850 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 14851 const AddrMode &AM, Type *Ty, 14852 unsigned AS) const { 14853 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 14854 if (Subtarget->hasFPAO()) 14855 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 14856 return 0; 14857 } 14858 return -1; 14859 } 14860 14861 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 14862 if (V < 0) 14863 return false; 14864 14865 unsigned Scale = 1; 14866 switch (VT.getSimpleVT().SimpleTy) { 14867 case MVT::i1: 14868 case MVT::i8: 14869 // Scale == 1; 14870 break; 14871 case MVT::i16: 14872 // Scale == 2; 14873 Scale = 2; 14874 break; 14875 default: 14876 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 14877 // Scale == 4; 14878 Scale = 4; 14879 break; 14880 } 14881 14882 if ((V & (Scale - 1)) != 0) 14883 return false; 14884 return isUInt<5>(V / Scale); 14885 } 14886 14887 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 14888 const ARMSubtarget *Subtarget) { 14889 if (!VT.isInteger() && !VT.isFloatingPoint()) 14890 return false; 14891 if (VT.isVector() && Subtarget->hasNEON()) 14892 return false; 14893 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 14894 !Subtarget->hasMVEFloatOps()) 14895 return false; 14896 14897 bool IsNeg = false; 14898 if (V < 0) { 14899 IsNeg = true; 14900 V = -V; 14901 } 14902 14903 unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U); 14904 14905 // MVE: size * imm7 14906 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 14907 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 14908 case MVT::i32: 14909 case MVT::f32: 14910 return isShiftedUInt<7,2>(V); 14911 case MVT::i16: 14912 case MVT::f16: 14913 return isShiftedUInt<7,1>(V); 14914 case MVT::i8: 14915 return isUInt<7>(V); 14916 default: 14917 return false; 14918 } 14919 } 14920 14921 // half VLDR: 2 * imm8 14922 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 14923 return isShiftedUInt<8, 1>(V); 14924 // VLDR and LDRD: 4 * imm8 14925 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 14926 return isShiftedUInt<8, 2>(V); 14927 14928 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 14929 // + imm12 or - imm8 14930 if (IsNeg) 14931 return isUInt<8>(V); 14932 return isUInt<12>(V); 14933 } 14934 14935 return false; 14936 } 14937 14938 /// isLegalAddressImmediate - Return true if the integer value can be used 14939 /// as the offset of the target addressing mode for load / store of the 14940 /// given type. 
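/// For example, in ARM mode an i32 LDR/STR offset is a 12-bit magnitude, so
/// any offset in [-4095, 4095] is legal, while an i16 access (LDRH/STRH)
/// only has an 8-bit offset field, i.e. [-255, 255].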
14941 static bool isLegalAddressImmediate(int64_t V, EVT VT, 14942 const ARMSubtarget *Subtarget) { 14943 if (V == 0) 14944 return true; 14945 14946 if (!VT.isSimple()) 14947 return false; 14948 14949 if (Subtarget->isThumb1Only()) 14950 return isLegalT1AddressImmediate(V, VT); 14951 else if (Subtarget->isThumb2()) 14952 return isLegalT2AddressImmediate(V, VT, Subtarget); 14953 14954 // ARM mode. 14955 if (V < 0) 14956 V = - V; 14957 switch (VT.getSimpleVT().SimpleTy) { 14958 default: return false; 14959 case MVT::i1: 14960 case MVT::i8: 14961 case MVT::i32: 14962 // +- imm12 14963 return isUInt<12>(V); 14964 case MVT::i16: 14965 // +- imm8 14966 return isUInt<8>(V); 14967 case MVT::f32: 14968 case MVT::f64: 14969 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 14970 return false; 14971 return isShiftedUInt<8, 2>(V); 14972 } 14973 } 14974 14975 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 14976 EVT VT) const { 14977 int Scale = AM.Scale; 14978 if (Scale < 0) 14979 return false; 14980 14981 switch (VT.getSimpleVT().SimpleTy) { 14982 default: return false; 14983 case MVT::i1: 14984 case MVT::i8: 14985 case MVT::i16: 14986 case MVT::i32: 14987 if (Scale == 1) 14988 return true; 14989 // r + r << imm 14990 Scale = Scale & ~1; 14991 return Scale == 2 || Scale == 4 || Scale == 8; 14992 case MVT::i64: 14993 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 14994 // version in Thumb mode. 14995 // r + r 14996 if (Scale == 1) 14997 return true; 14998 // r * 2 (this can be lowered to r + r). 14999 if (!AM.HasBaseReg && Scale == 2) 15000 return true; 15001 return false; 15002 case MVT::isVoid: 15003 // Note, we allow "void" uses (basically, uses that aren't loads or 15004 // stores), because arm allows folding a scale into many arithmetic 15005 // operations. This should be made more precise and revisited later. 15006 15007 // Allow r << imm, but the imm has to be a multiple of two. 15008 if (Scale & 1) return false; 15009 return isPowerOf2_32(Scale); 15010 } 15011 } 15012 15013 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 15014 EVT VT) const { 15015 const int Scale = AM.Scale; 15016 15017 // Negative scales are not supported in Thumb1. 15018 if (Scale < 0) 15019 return false; 15020 15021 // Thumb1 addressing modes do not support register scaling excepting the 15022 // following cases: 15023 // 1. Scale == 1 means no scaling. 15024 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 15025 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 15026 } 15027 15028 /// isLegalAddressingMode - Return true if the addressing mode represented 15029 /// by AM is legal for this target, for a load/store of the specified type. 15030 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 15031 const AddrMode &AM, Type *Ty, 15032 unsigned AS, Instruction *I) const { 15033 EVT VT = getValueType(DL, Ty, true); 15034 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 15035 return false; 15036 15037 // Can never fold addr of global into load/store. 15038 if (AM.BaseGV) 15039 return false; 15040 15041 switch (AM.Scale) { 15042 case 0: // no scale reg, must be "r+i" or "r", or "i". 15043 break; 15044 default: 15045 // ARM doesn't support any R+R*scale+imm addr modes. 
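// For example, an address of the form r0 + r1*4 + 8 is rejected here, while
// r0 + r1*4 with no immediate may still be accepted by the mode-specific
// scaled-addressing checks below.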
15046 if (AM.BaseOffs) 15047 return false; 15048 15049 if (!VT.isSimple()) 15050 return false; 15051 15052 if (Subtarget->isThumb1Only()) 15053 return isLegalT1ScaledAddressingMode(AM, VT); 15054 15055 if (Subtarget->isThumb2()) 15056 return isLegalT2ScaledAddressingMode(AM, VT); 15057 15058 int Scale = AM.Scale; 15059 switch (VT.getSimpleVT().SimpleTy) { 15060 default: return false; 15061 case MVT::i1: 15062 case MVT::i8: 15063 case MVT::i32: 15064 if (Scale < 0) Scale = -Scale; 15065 if (Scale == 1) 15066 return true; 15067 // r + r << imm 15068 return isPowerOf2_32(Scale & ~1); 15069 case MVT::i16: 15070 case MVT::i64: 15071 // r +/- r 15072 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 15073 return true; 15074 // r * 2 (this can be lowered to r + r). 15075 if (!AM.HasBaseReg && Scale == 2) 15076 return true; 15077 return false; 15078 15079 case MVT::isVoid: 15080 // Note, we allow "void" uses (basically, uses that aren't loads or 15081 // stores), because arm allows folding a scale into many arithmetic 15082 // operations. This should be made more precise and revisited later. 15083 15084 // Allow r << imm, but the imm has to be a multiple of two. 15085 if (Scale & 1) return false; 15086 return isPowerOf2_32(Scale); 15087 } 15088 } 15089 return true; 15090 } 15091 15092 /// isLegalICmpImmediate - Return true if the specified immediate is legal 15093 /// icmp immediate, that is the target has icmp instructions which can compare 15094 /// a register against the immediate without having to materialize the 15095 /// immediate into a register. 15096 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 15097 // Thumb2 and ARM modes can use cmn for negative immediates. 15098 if (!Subtarget->isThumb()) 15099 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 15100 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 15101 if (Subtarget->isThumb2()) 15102 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 15103 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 15104 // Thumb1 doesn't have cmn, and only 8-bit immediates. 15105 return Imm >= 0 && Imm <= 255; 15106 } 15107 15108 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 15109 /// *or sub* immediate, that is the target has add or sub instructions which can 15110 /// add a register with the immediate without having to materialize the 15111 /// immediate into a register. 15112 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 15113 // Same encoding for add/sub, just flip the sign. 15114 int64_t AbsImm = std::abs(Imm); 15115 if (!Subtarget->isThumb()) 15116 return ARM_AM::getSOImmVal(AbsImm) != -1; 15117 if (Subtarget->isThumb2()) 15118 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 15119 // Thumb1 only has 8-bit unsigned immediate. 
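// For example, ADDS Rdn, #255 encodes directly in Thumb1, but adding #256
// requires materializing the constant into a register first. ARM and Thumb2
// instead accept their rotated/modified immediate encodings, checked via
// getSOImmVal / getT2SOImmVal above.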
15120 return AbsImm >= 0 && AbsImm <= 255; 15121 } 15122 15123 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 15124 bool isSEXTLoad, SDValue &Base, 15125 SDValue &Offset, bool &isInc, 15126 SelectionDAG &DAG) { 15127 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15128 return false; 15129 15130 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 15131 // AddressingMode 3 15132 Base = Ptr->getOperand(0); 15133 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15134 int RHSC = (int)RHS->getZExtValue(); 15135 if (RHSC < 0 && RHSC > -256) { 15136 assert(Ptr->getOpcode() == ISD::ADD); 15137 isInc = false; 15138 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15139 return true; 15140 } 15141 } 15142 isInc = (Ptr->getOpcode() == ISD::ADD); 15143 Offset = Ptr->getOperand(1); 15144 return true; 15145 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 15146 // AddressingMode 2 15147 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15148 int RHSC = (int)RHS->getZExtValue(); 15149 if (RHSC < 0 && RHSC > -0x1000) { 15150 assert(Ptr->getOpcode() == ISD::ADD); 15151 isInc = false; 15152 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15153 Base = Ptr->getOperand(0); 15154 return true; 15155 } 15156 } 15157 15158 if (Ptr->getOpcode() == ISD::ADD) { 15159 isInc = true; 15160 ARM_AM::ShiftOpc ShOpcVal= 15161 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 15162 if (ShOpcVal != ARM_AM::no_shift) { 15163 Base = Ptr->getOperand(1); 15164 Offset = Ptr->getOperand(0); 15165 } else { 15166 Base = Ptr->getOperand(0); 15167 Offset = Ptr->getOperand(1); 15168 } 15169 return true; 15170 } 15171 15172 isInc = (Ptr->getOpcode() == ISD::ADD); 15173 Base = Ptr->getOperand(0); 15174 Offset = Ptr->getOperand(1); 15175 return true; 15176 } 15177 15178 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 15179 return false; 15180 } 15181 15182 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 15183 bool isSEXTLoad, SDValue &Base, 15184 SDValue &Offset, bool &isInc, 15185 SelectionDAG &DAG) { 15186 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15187 return false; 15188 15189 Base = Ptr->getOperand(0); 15190 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15191 int RHSC = (int)RHS->getZExtValue(); 15192 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 15193 assert(Ptr->getOpcode() == ISD::ADD); 15194 isInc = false; 15195 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15196 return true; 15197 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 
15198 isInc = Ptr->getOpcode() == ISD::ADD; 15199 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15200 return true; 15201 } 15202 } 15203 15204 return false; 15205 } 15206 15207 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, 15208 bool isSEXTLoad, bool isLE, SDValue &Base, 15209 SDValue &Offset, bool &isInc, 15210 SelectionDAG &DAG) { 15211 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15212 return false; 15213 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 15214 return false; 15215 15216 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 15217 int RHSC = (int)RHS->getZExtValue(); 15218 15219 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 15220 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 15221 assert(Ptr->getOpcode() == ISD::ADD); 15222 isInc = false; 15223 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15224 return true; 15225 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 15226 isInc = Ptr->getOpcode() == ISD::ADD; 15227 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15228 return true; 15229 } 15230 return false; 15231 }; 15232 15233 // Try to find a matching instruction based on s/zext, Alignment, Offset and 15234 // (in BE) type. 15235 Base = Ptr->getOperand(0); 15236 if (VT == MVT::v4i16) { 15237 if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) 15238 return true; 15239 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 15240 if (IsInRange(RHSC, 0x80, 1)) 15241 return true; 15242 } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && 15243 IsInRange(RHSC, 0x80, 4)) 15244 return true; 15245 else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && 15246 IsInRange(RHSC, 0x80, 2)) 15247 return true; 15248 else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 15249 return true; 15250 return false; 15251 } 15252 15253 /// getPreIndexedAddressParts - returns true by value, base pointer and 15254 /// offset pointer and addressing mode by reference if the node's address 15255 /// can be legally represented as pre-indexed load / store address. 15256 bool 15257 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 15258 SDValue &Offset, 15259 ISD::MemIndexedMode &AM, 15260 SelectionDAG &DAG) const { 15261 if (Subtarget->isThumb1Only()) 15262 return false; 15263 15264 EVT VT; 15265 SDValue Ptr; 15266 unsigned Align; 15267 bool isSEXTLoad = false; 15268 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 15269 Ptr = LD->getBasePtr(); 15270 VT = LD->getMemoryVT(); 15271 Align = LD->getAlignment(); 15272 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15273 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 15274 Ptr = ST->getBasePtr(); 15275 VT = ST->getMemoryVT(); 15276 Align = ST->getAlignment(); 15277 } else 15278 return false; 15279 15280 bool isInc; 15281 bool isLegal = false; 15282 if (VT.isVector()) 15283 isLegal = Subtarget->hasMVEIntegerOps() && 15284 getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, 15285 Subtarget->isLittle(), Base, Offset, 15286 isInc, DAG); 15287 else { 15288 if (Subtarget->isThumb2()) 15289 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 15290 Offset, isInc, DAG); 15291 else 15292 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 15293 Offset, isInc, DAG); 15294 } 15295 if (!isLegal) 15296 return false; 15297 15298 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 15299 return true; 15300 } 15301 15302 /// getPostIndexedAddressParts - returns true by value, base pointer and 15303 /// offset pointer and addressing mode by reference if this node can be 15304 /// combined with a load / store to form a post-indexed load / store. 15305 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 15306 SDValue &Base, 15307 SDValue &Offset, 15308 ISD::MemIndexedMode &AM, 15309 SelectionDAG &DAG) const { 15310 EVT VT; 15311 SDValue Ptr; 15312 unsigned Align; 15313 bool isSEXTLoad = false, isNonExt; 15314 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 15315 VT = LD->getMemoryVT(); 15316 Ptr = LD->getBasePtr(); 15317 Align = LD->getAlignment(); 15318 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 15319 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 15320 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 15321 VT = ST->getMemoryVT(); 15322 Ptr = ST->getBasePtr(); 15323 Align = ST->getAlignment(); 15324 isNonExt = !ST->isTruncatingStore(); 15325 } else 15326 return false; 15327 15328 if (Subtarget->isThumb1Only()) { 15329 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 15330 // must be non-extending/truncating, i32, with an offset of 4. 15331 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 15332 if (Op->getOpcode() != ISD::ADD || !isNonExt) 15333 return false; 15334 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 15335 if (!RHS || RHS->getZExtValue() != 4) 15336 return false; 15337 15338 Offset = Op->getOperand(1); 15339 Base = Op->getOperand(0); 15340 AM = ISD::POST_INC; 15341 return true; 15342 } 15343 15344 bool isInc; 15345 bool isLegal = false; 15346 if (VT.isVector()) 15347 isLegal = Subtarget->hasMVEIntegerOps() && 15348 getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, 15349 Subtarget->isLittle(), Base, Offset, 15350 isInc, DAG); 15351 else { 15352 if (Subtarget->isThumb2()) 15353 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 15354 isInc, DAG); 15355 else 15356 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 15357 isInc, DAG); 15358 } 15359 if (!isLegal) 15360 return false; 15361 15362 if (Ptr != Base) { 15363 // Swap base ptr and offset to catch more post-index load / store when 15364 // it's legal. In Thumb2 mode, offset must be an immediate. 15365 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 15366 !Subtarget->isThumb2()) 15367 std::swap(Base, Offset); 15368 15369 // Post-indexed load / store update the base pointer. 15370 if (Ptr != Base) 15371 return false; 15372 } 15373 15374 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 15375 return true; 15376 } 15377 15378 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 15379 KnownBits &Known, 15380 const APInt &DemandedElts, 15381 const SelectionDAG &DAG, 15382 unsigned Depth) const { 15383 unsigned BitWidth = Known.getBitWidth(); 15384 Known.resetAll(); 15385 switch (Op.getOpcode()) { 15386 default: break; 15387 case ARMISD::ADDC: 15388 case ARMISD::ADDE: 15389 case ARMISD::SUBC: 15390 case ARMISD::SUBE: 15391 // Special cases when we convert a carry to a boolean. 15392 if (Op.getResNo() == 0) { 15393 SDValue LHS = Op.getOperand(0); 15394 SDValue RHS = Op.getOperand(1); 15395 // (ADDE 0, 0, C) will give us a single bit. 
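// That is, ADDE computes LHS + RHS + carry, so with both operands known to
// be zero the result is exactly the incoming carry (0 or 1), and every bit
// except bit 0 can be marked known-zero below.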
15396 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 15397 isNullConstant(RHS)) { 15398 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 15399 return; 15400 } 15401 } 15402 break; 15403 case ARMISD::CMOV: { 15404 // Bits are known zero/one if known on the LHS and RHS. 15405 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 15406 if (Known.isUnknown()) 15407 return; 15408 15409 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 15410 Known.Zero &= KnownRHS.Zero; 15411 Known.One &= KnownRHS.One; 15412 return; 15413 } 15414 case ISD::INTRINSIC_W_CHAIN: { 15415 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 15416 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 15417 switch (IntID) { 15418 default: return; 15419 case Intrinsic::arm_ldaex: 15420 case Intrinsic::arm_ldrex: { 15421 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 15422 unsigned MemBits = VT.getScalarSizeInBits(); 15423 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 15424 return; 15425 } 15426 } 15427 } 15428 case ARMISD::BFI: { 15429 // Conservatively, we can recurse down the first operand 15430 // and just mask out all affected bits. 15431 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 15432 15433 // The operand to BFI is already a mask suitable for removing the bits it 15434 // sets. 15435 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 15436 const APInt &Mask = CI->getAPIntValue(); 15437 Known.Zero &= Mask; 15438 Known.One &= Mask; 15439 return; 15440 } 15441 case ARMISD::VGETLANEs: 15442 case ARMISD::VGETLANEu: { 15443 const SDValue &SrcSV = Op.getOperand(0); 15444 EVT VecVT = SrcSV.getValueType(); 15445 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 15446 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 15447 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 15448 assert(Pos->getAPIntValue().ult(NumSrcElts) && 15449 "VGETLANE index out of bounds"); 15450 unsigned Idx = Pos->getZExtValue(); 15451 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 15452 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 15453 15454 EVT VT = Op.getValueType(); 15455 const unsigned DstSz = VT.getScalarSizeInBits(); 15456 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 15457 (void)SrcSz; 15458 assert(SrcSz == Known.getBitWidth()); 15459 assert(DstSz > SrcSz); 15460 if (Op.getOpcode() == ARMISD::VGETLANEs) 15461 Known = Known.sext(DstSz); 15462 else { 15463 Known = Known.zext(DstSz, true /* extended bits are known zero */); 15464 } 15465 assert(DstSz == Known.getBitWidth()); 15466 break; 15467 } 15468 } 15469 } 15470 15471 bool 15472 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, 15473 const APInt &DemandedAPInt, 15474 TargetLoweringOpt &TLO) const { 15475 // Delay optimization, so we don't have to deal with illegal types, or block 15476 // optimizations. 15477 if (!TLO.LegalOps) 15478 return false; 15479 15480 // Only optimize AND for now. 15481 if (Op.getOpcode() != ISD::AND) 15482 return false; 15483 15484 EVT VT = Op.getValueType(); 15485 15486 // Ignore vectors. 15487 if (VT.isVector()) 15488 return false; 15489 15490 assert(VT == MVT::i32 && "Unexpected integer type"); 15491 15492 // Make sure the RHS really is a constant. 
15493 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 15494 if (!C) 15495 return false; 15496 15497 unsigned Mask = C->getZExtValue(); 15498 15499 unsigned Demanded = DemandedAPInt.getZExtValue(); 15500 unsigned ShrunkMask = Mask & Demanded; 15501 unsigned ExpandedMask = Mask | ~Demanded; 15502 15503 // If the mask is all zeros, let the target-independent code replace the 15504 // result with zero. 15505 if (ShrunkMask == 0) 15506 return false; 15507 15508 // If the mask is all ones, erase the AND. (Currently, the target-independent 15509 // code won't do this, so we have to do it explicitly to avoid an infinite 15510 // loop in obscure cases.) 15511 if (ExpandedMask == ~0U) 15512 return TLO.CombineTo(Op, Op.getOperand(0)); 15513 15514 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 15515 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 15516 }; 15517 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 15518 if (NewMask == Mask) 15519 return true; 15520 SDLoc DL(Op); 15521 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 15522 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 15523 return TLO.CombineTo(Op, NewOp); 15524 }; 15525 15526 // Prefer uxtb mask. 15527 if (IsLegalMask(0xFF)) 15528 return UseMask(0xFF); 15529 15530 // Prefer uxth mask. 15531 if (IsLegalMask(0xFFFF)) 15532 return UseMask(0xFFFF); 15533 15534 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 15535 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 15536 if (ShrunkMask < 256) 15537 return UseMask(ShrunkMask); 15538 15539 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 15540 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 15541 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 15542 return UseMask(ExpandedMask); 15543 15544 // Potential improvements: 15545 // 15546 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 15547 // We could try to prefer Thumb1 immediates which can be lowered to a 15548 // two-instruction sequence. 15549 // We could try to recognize more legal ARM/Thumb2 immediates here. 15550 15551 return false; 15552 } 15553 15554 15555 //===----------------------------------------------------------------------===// 15556 // ARM Inline Assembly Support 15557 //===----------------------------------------------------------------------===// 15558 15559 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 15560 // Looking for "rev" which is V6+. 
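// For example, an inline-asm call whose template is exactly "rev $0, $1"
// with constraint string "=l,l" and a 32-bit integer result is matched
// below and rewritten into a bswap intrinsic call instead of being emitted
// as real inline assembly.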
15561 if (!Subtarget->hasV6Ops()) 15562 return false; 15563 15564 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 15565 std::string AsmStr = IA->getAsmString(); 15566 SmallVector<StringRef, 4> AsmPieces; 15567 SplitString(AsmStr, AsmPieces, ";\n"); 15568 15569 switch (AsmPieces.size()) { 15570 default: return false; 15571 case 1: 15572 AsmStr = AsmPieces[0]; 15573 AsmPieces.clear(); 15574 SplitString(AsmStr, AsmPieces, " \t,"); 15575 15576 // rev $0, $1 15577 if (AsmPieces.size() == 3 && 15578 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 15579 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 15580 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 15581 if (Ty && Ty->getBitWidth() == 32) 15582 return IntrinsicLowering::LowerToByteSwap(CI); 15583 } 15584 break; 15585 } 15586 15587 return false; 15588 } 15589 15590 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 15591 // At this point, we have to lower this constraint to something else, so we 15592 // lower it to an "r" or "w". However, by doing this we will force the result 15593 // to be in register, while the X constraint is much more permissive. 15594 // 15595 // Although we are correct (we are free to emit anything, without 15596 // constraints), we might break use cases that would expect us to be more 15597 // efficient and emit something else. 15598 if (!Subtarget->hasVFP2Base()) 15599 return "r"; 15600 if (ConstraintVT.isFloatingPoint()) 15601 return "w"; 15602 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 15603 (ConstraintVT.getSizeInBits() == 64 || 15604 ConstraintVT.getSizeInBits() == 128)) 15605 return "w"; 15606 15607 return "r"; 15608 } 15609 15610 /// getConstraintType - Given a constraint letter, return the type of 15611 /// constraint it is for this target. 15612 ARMTargetLowering::ConstraintType 15613 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 15614 unsigned S = Constraint.size(); 15615 if (S == 1) { 15616 switch (Constraint[0]) { 15617 default: break; 15618 case 'l': return C_RegisterClass; 15619 case 'w': return C_RegisterClass; 15620 case 'h': return C_RegisterClass; 15621 case 'x': return C_RegisterClass; 15622 case 't': return C_RegisterClass; 15623 case 'j': return C_Immediate; // Constant for movw. 15624 // An address with a single base register. Due to the way we 15625 // currently handle addresses it is the same as an 'r' memory constraint. 15626 case 'Q': return C_Memory; 15627 } 15628 } else if (S == 2) { 15629 switch (Constraint[0]) { 15630 default: break; 15631 case 'T': return C_RegisterClass; 15632 // All 'U+' constraints are addresses. 15633 case 'U': return C_Memory; 15634 } 15635 } 15636 return TargetLowering::getConstraintType(Constraint); 15637 } 15638 15639 /// Examine constraint type and operand type and determine a weight value. 15640 /// This object must already have been set up with the operand type 15641 /// and the current alternative constraint selected. 15642 TargetLowering::ConstraintWeight 15643 ARMTargetLowering::getSingleConstraintMatchWeight( 15644 AsmOperandInfo &info, const char *constraint) const { 15645 ConstraintWeight weight = CW_Invalid; 15646 Value *CallOperandVal = info.CallOperandVal; 15647 // If we don't have a value, we can't do a match, 15648 // but allow it at the lowest weight. 15649 if (!CallOperandVal) 15650 return CW_Default; 15651 Type *type = CallOperandVal->getType(); 15652 // Look at the constraint type. 
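// For example, 'l' on an integer operand is weighted as a specific register
// class on Thumb targets (the low registers), 'w' only matches
// floating-point values, and anything else falls back to the generic
// weighting.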
15653 switch (*constraint) { 15654 default: 15655 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 15656 break; 15657 case 'l': 15658 if (type->isIntegerTy()) { 15659 if (Subtarget->isThumb()) 15660 weight = CW_SpecificReg; 15661 else 15662 weight = CW_Register; 15663 } 15664 break; 15665 case 'w': 15666 if (type->isFloatingPointTy()) 15667 weight = CW_Register; 15668 break; 15669 } 15670 return weight; 15671 } 15672 15673 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 15674 15675 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 15676 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 15677 switch (Constraint.size()) { 15678 case 1: 15679 // GCC ARM Constraint Letters 15680 switch (Constraint[0]) { 15681 case 'l': // Low regs or general regs. 15682 if (Subtarget->isThumb()) 15683 return RCPair(0U, &ARM::tGPRRegClass); 15684 return RCPair(0U, &ARM::GPRRegClass); 15685 case 'h': // High regs or no regs. 15686 if (Subtarget->isThumb()) 15687 return RCPair(0U, &ARM::hGPRRegClass); 15688 break; 15689 case 'r': 15690 if (Subtarget->isThumb1Only()) 15691 return RCPair(0U, &ARM::tGPRRegClass); 15692 return RCPair(0U, &ARM::GPRRegClass); 15693 case 'w': 15694 if (VT == MVT::Other) 15695 break; 15696 if (VT == MVT::f32) 15697 return RCPair(0U, &ARM::SPRRegClass); 15698 if (VT.getSizeInBits() == 64) 15699 return RCPair(0U, &ARM::DPRRegClass); 15700 if (VT.getSizeInBits() == 128) 15701 return RCPair(0U, &ARM::QPRRegClass); 15702 break; 15703 case 'x': 15704 if (VT == MVT::Other) 15705 break; 15706 if (VT == MVT::f32) 15707 return RCPair(0U, &ARM::SPR_8RegClass); 15708 if (VT.getSizeInBits() == 64) 15709 return RCPair(0U, &ARM::DPR_8RegClass); 15710 if (VT.getSizeInBits() == 128) 15711 return RCPair(0U, &ARM::QPR_8RegClass); 15712 break; 15713 case 't': 15714 if (VT == MVT::Other) 15715 break; 15716 if (VT == MVT::f32 || VT == MVT::i32) 15717 return RCPair(0U, &ARM::SPRRegClass); 15718 if (VT.getSizeInBits() == 64) 15719 return RCPair(0U, &ARM::DPR_VFP2RegClass); 15720 if (VT.getSizeInBits() == 128) 15721 return RCPair(0U, &ARM::QPR_VFP2RegClass); 15722 break; 15723 } 15724 break; 15725 15726 case 2: 15727 if (Constraint[0] == 'T') { 15728 switch (Constraint[1]) { 15729 default: 15730 break; 15731 case 'e': 15732 return RCPair(0U, &ARM::tGPREvenRegClass); 15733 case 'o': 15734 return RCPair(0U, &ARM::tGPROddRegClass); 15735 } 15736 } 15737 break; 15738 15739 default: 15740 break; 15741 } 15742 15743 if (StringRef("{cc}").equals_lower(Constraint)) 15744 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 15745 15746 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 15747 } 15748 15749 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 15750 /// vector. If it is invalid, don't add anything to Ops. 15751 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 15752 std::string &Constraint, 15753 std::vector<SDValue>&Ops, 15754 SelectionDAG &DAG) const { 15755 SDValue Result; 15756 15757 // Currently only support length 1 constraints. 
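// The letters handled below are 'j' (movw immediate) and GCC's 'I'..'O'
// immediate families; any other constraint falls through to the generic
// TargetLowering handling at the end of the function.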
15758 if (Constraint.length() != 1) return; 15759 15760 char ConstraintLetter = Constraint[0]; 15761 switch (ConstraintLetter) { 15762 default: break; 15763 case 'j': 15764 case 'I': case 'J': case 'K': case 'L': 15765 case 'M': case 'N': case 'O': 15766 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 15767 if (!C) 15768 return; 15769 15770 int64_t CVal64 = C->getSExtValue(); 15771 int CVal = (int) CVal64; 15772 // None of these constraints allow values larger than 32 bits. Check 15773 // that the value fits in an int. 15774 if (CVal != CVal64) 15775 return; 15776 15777 switch (ConstraintLetter) { 15778 case 'j': 15779 // Constant suitable for movw, must be between 0 and 15780 // 65535. 15781 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 15782 if (CVal >= 0 && CVal <= 65535) 15783 break; 15784 return; 15785 case 'I': 15786 if (Subtarget->isThumb1Only()) { 15787 // This must be a constant between 0 and 255, for ADD 15788 // immediates. 15789 if (CVal >= 0 && CVal <= 255) 15790 break; 15791 } else if (Subtarget->isThumb2()) { 15792 // A constant that can be used as an immediate value in a 15793 // data-processing instruction. 15794 if (ARM_AM::getT2SOImmVal(CVal) != -1) 15795 break; 15796 } else { 15797 // A constant that can be used as an immediate value in a 15798 // data-processing instruction. 15799 if (ARM_AM::getSOImmVal(CVal) != -1) 15800 break; 15801 } 15802 return; 15803 15804 case 'J': 15805 if (Subtarget->isThumb1Only()) { 15806 // This must be a constant between -255 and -1, for negated ADD 15807 // immediates. This can be used in GCC with an "n" modifier that 15808 // prints the negated value, for use with SUB instructions. It is 15809 // not useful otherwise but is implemented for compatibility. 15810 if (CVal >= -255 && CVal <= -1) 15811 break; 15812 } else { 15813 // This must be a constant between -4095 and 4095. It is not clear 15814 // what this constraint is intended for. Implemented for 15815 // compatibility with GCC. 15816 if (CVal >= -4095 && CVal <= 4095) 15817 break; 15818 } 15819 return; 15820 15821 case 'K': 15822 if (Subtarget->isThumb1Only()) { 15823 // A 32-bit value where only one byte has a nonzero value. Exclude 15824 // zero to match GCC. This constraint is used by GCC internally for 15825 // constants that can be loaded with a move/shift combination. 15826 // It is not useful otherwise but is implemented for compatibility. 15827 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 15828 break; 15829 } else if (Subtarget->isThumb2()) { 15830 // A constant whose bitwise inverse can be used as an immediate 15831 // value in a data-processing instruction. This can be used in GCC 15832 // with a "B" modifier that prints the inverted value, for use with 15833 // BIC and MVN instructions. It is not useful otherwise but is 15834 // implemented for compatibility. 15835 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 15836 break; 15837 } else { 15838 // A constant whose bitwise inverse can be used as an immediate 15839 // value in a data-processing instruction. This can be used in GCC 15840 // with a "B" modifier that prints the inverted value, for use with 15841 // BIC and MVN instructions. It is not useful otherwise but is 15842 // implemented for compatibility. 15843 if (ARM_AM::getSOImmVal(~CVal) != -1) 15844 break; 15845 } 15846 return; 15847 15848 case 'L': 15849 if (Subtarget->isThumb1Only()) { 15850 // This must be a constant between -7 and 7, 15851 // for 3-operand ADD/SUB immediate instructions. 
15852 if (CVal >= -7 && CVal < 7) 15853 break; 15854 } else if (Subtarget->isThumb2()) { 15855 // A constant whose negation can be used as an immediate value in a 15856 // data-processing instruction. This can be used in GCC with an "n" 15857 // modifier that prints the negated value, for use with SUB 15858 // instructions. It is not useful otherwise but is implemented for 15859 // compatibility. 15860 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 15861 break; 15862 } else { 15863 // A constant whose negation can be used as an immediate value in a 15864 // data-processing instruction. This can be used in GCC with an "n" 15865 // modifier that prints the negated value, for use with SUB 15866 // instructions. It is not useful otherwise but is implemented for 15867 // compatibility. 15868 if (ARM_AM::getSOImmVal(-CVal) != -1) 15869 break; 15870 } 15871 return; 15872 15873 case 'M': 15874 if (Subtarget->isThumb1Only()) { 15875 // This must be a multiple of 4 between 0 and 1020, for 15876 // ADD sp + immediate. 15877 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 15878 break; 15879 } else { 15880 // A power of two or a constant between 0 and 32. This is used in 15881 // GCC for the shift amount on shifted register operands, but it is 15882 // useful in general for any shift amounts. 15883 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 15884 break; 15885 } 15886 return; 15887 15888 case 'N': 15889 if (Subtarget->isThumb1Only()) { 15890 // This must be a constant between 0 and 31, for shift amounts. 15891 if (CVal >= 0 && CVal <= 31) 15892 break; 15893 } 15894 return; 15895 15896 case 'O': 15897 if (Subtarget->isThumb1Only()) { 15898 // This must be a multiple of 4 between -508 and 508, for 15899 // ADD/SUB sp = sp + immediate. 15900 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 15901 break; 15902 } 15903 return; 15904 } 15905 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 15906 break; 15907 } 15908 15909 if (Result.getNode()) { 15910 Ops.push_back(Result); 15911 return; 15912 } 15913 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 15914 } 15915 15916 static RTLIB::Libcall getDivRemLibcall( 15917 const SDNode *N, MVT::SimpleValueType SVT) { 15918 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 15919 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 15920 "Unhandled Opcode in getDivRemLibcall"); 15921 bool isSigned = N->getOpcode() == ISD::SDIVREM || 15922 N->getOpcode() == ISD::SREM; 15923 RTLIB::Libcall LC; 15924 switch (SVT) { 15925 default: llvm_unreachable("Unexpected request for libcall!"); 15926 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 15927 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 15928 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 15929 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 15930 } 15931 return LC; 15932 } 15933 15934 static TargetLowering::ArgListTy getDivRemArgList( 15935 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 15936 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 15937 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 15938 "Unhandled Opcode in getDivRemArgList"); 15939 bool isSigned = N->getOpcode() == ISD::SDIVREM || 15940 N->getOpcode() == ISD::SREM; 15941 TargetLowering::ArgListTy Args; 15942 TargetLowering::ArgListEntry Entry; 15943 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 15944 EVT ArgVT = N->getOperand(i).getValueType(); 15945 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 15946 Entry.Node = N->getOperand(i); 15947 Entry.Ty = ArgTy; 15948 Entry.IsSExt = isSigned; 15949 Entry.IsZExt = !isSigned; 15950 Args.push_back(Entry); 15951 } 15952 if (Subtarget->isTargetWindows() && Args.size() >= 2) 15953 std::swap(Args[0], Args[1]); 15954 return Args; 15955 } 15956 15957 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 15958 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 15959 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 15960 Subtarget->isTargetWindows()) && 15961 "Register-based DivRem lowering only"); 15962 unsigned Opcode = Op->getOpcode(); 15963 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 15964 "Invalid opcode for Div/Rem lowering"); 15965 bool isSigned = (Opcode == ISD::SDIVREM); 15966 EVT VT = Op->getValueType(0); 15967 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 15968 SDLoc dl(Op); 15969 15970 // If the target has hardware divide, use divide + multiply + subtract: 15971 // div = a / b 15972 // rem = a - b * div 15973 // return {div, rem} 15974 // This should be lowered into UDIV/SDIV + MLS later on. 15975 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 15976 : Subtarget->hasDivideInARMMode(); 15977 if (hasDivide && Op->getValueType(0).isSimple() && 15978 Op->getSimpleValueType(0) == MVT::i32) { 15979 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 15980 const SDValue Dividend = Op->getOperand(0); 15981 const SDValue Divisor = Op->getOperand(1); 15982 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 15983 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 15984 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 15985 15986 SDValue Values[2] = {Div, Rem}; 15987 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 15988 } 15989 15990 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 15991 VT.getSimpleVT().SimpleTy); 15992 SDValue InChain = DAG.getEntryNode(); 15993 15994 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 15995 DAG.getContext(), 15996 Subtarget); 15997 15998 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 15999 getPointerTy(DAG.getDataLayout())); 16000 16001 Type *RetTy = StructType::get(Ty, Ty); 16002 16003 if (Subtarget->isTargetWindows()) 16004 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 16005 16006 TargetLowering::CallLoweringInfo CLI(DAG); 16007 CLI.setDebugLoc(dl).setChain(InChain) 16008 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 16009 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 16010 16011 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 16012 return CallInfo.first; 16013 } 16014 16015 // Lowers REM using divmod helpers 16016 // see RTABI section 4.2/4.3 16017 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 16018 // Build return types (div and rem) 16019 std::vector<Type*> RetTyParams; 16020 Type *RetTyElement; 16021 16022 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 16023 default: llvm_unreachable("Unexpected request for libcall!"); 16024 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 16025 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 16026 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 16027 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 16028 } 16029 16030 RetTyParams.push_back(RetTyElement); 16031 RetTyParams.push_back(RetTyElement); 16032 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 16033 Type *RetTy = StructType::get(*DAG.getContext(), ret); 16034 16035 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
16036 SimpleTy); 16037 SDValue InChain = DAG.getEntryNode(); 16038 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 16039 Subtarget); 16040 bool isSigned = N->getOpcode() == ISD::SREM; 16041 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16042 getPointerTy(DAG.getDataLayout())); 16043 16044 if (Subtarget->isTargetWindows()) 16045 InChain = WinDBZCheckDenominator(DAG, N, InChain); 16046 16047 // Lower call 16048 CallLoweringInfo CLI(DAG); 16049 CLI.setChain(InChain) 16050 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 16051 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 16052 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 16053 16054 // Return second (rem) result operand (first contains div) 16055 SDNode *ResNode = CallResult.first.getNode(); 16056 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 16057 return ResNode->getOperand(1); 16058 } 16059 16060 SDValue 16061 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 16062 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 16063 SDLoc DL(Op); 16064 16065 // Get the inputs. 16066 SDValue Chain = Op.getOperand(0); 16067 SDValue Size = Op.getOperand(1); 16068 16069 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 16070 "no-stack-arg-probe")) { 16071 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 16072 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16073 Chain = SP.getValue(1); 16074 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 16075 if (Align) 16076 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 16077 DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); 16078 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 16079 SDValue Ops[2] = { SP, Chain }; 16080 return DAG.getMergeValues(Ops, DL); 16081 } 16082 16083 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 16084 DAG.getConstant(2, DL, MVT::i32)); 16085 16086 SDValue Flag; 16087 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 16088 Flag = Chain.getValue(1); 16089 16090 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 16091 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 16092 16093 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 16094 Chain = NewSP.getValue(1); 16095 16096 SDValue Ops[2] = { NewSP, Chain }; 16097 return DAG.getMergeValues(Ops, DL); 16098 } 16099 16100 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 16101 SDValue SrcVal = Op.getOperand(0); 16102 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16103 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 16104 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 16105 "Unexpected type for custom-lowering FP_EXTEND"); 16106 16107 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16108 "With both FP DP and 16, any FP conversion is legal!"); 16109 16110 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 16111 "With FP16, 16 to 32 conversion is legal!"); 16112 16113 // Either we are converting from 16 -> 64, without FP16 and/or 16114 // FP.double-precision or without Armv8-fp. So we must do it in two 16115 // steps. 16116 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 16117 // without FP16. So we must do a function call. 
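// As a concrete walk-through: extending f16 -> f64 on a target with FP16
// but no double precision goes through the hardware 16 -> 32 FP_EXTEND
// emitted below and then a library call for the 32 -> 64 step (the
// RTLIB::getFPEXT(MVT::f32, MVT::f64) routine, e.g. __aeabi_f2d or
// __extendsfdf2 depending on the ABI); with neither feature available,
// both steps become library calls.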
16118 SDLoc Loc(Op); 16119 RTLIB::Libcall LC; 16120 MakeLibCallOptions CallOptions; 16121 if (SrcSz == 16) { 16122 // Instruction from 16 -> 32 16123 if (Subtarget->hasFP16()) 16124 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal); 16125 // Lib call from 16 -> 32 16126 else { 16127 LC = RTLIB::getFPEXT(MVT::f16, MVT::f32); 16128 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16129 "Unexpected type for custom-lowering FP_EXTEND"); 16130 SrcVal = 16131 makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first; 16132 } 16133 } 16134 16135 if (DstSz != 64) 16136 return SrcVal; 16137 // For sure now SrcVal is 32 bits 16138 if (Subtarget->hasFP64()) // Instruction from 32 -> 64 16139 return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal); 16140 16141 LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); 16142 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16143 "Unexpected type for custom-lowering FP_EXTEND"); 16144 return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first; 16145 } 16146 16147 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 16148 SDValue SrcVal = Op.getOperand(0); 16149 EVT SrcVT = SrcVal.getValueType(); 16150 EVT DstVT = Op.getValueType(); 16151 const unsigned DstSz = Op.getValueType().getSizeInBits(); 16152 const unsigned SrcSz = SrcVT.getSizeInBits(); 16153 (void)DstSz; 16154 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 16155 "Unexpected type for custom-lowering FP_ROUND"); 16156 16157 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 16158 "With both FP DP and 16, any FP conversion is legal!"); 16159 16160 SDLoc Loc(Op); 16161 16162 // Instruction from 32 -> 16 if hasFP16 is valid 16163 if (SrcSz == 32 && Subtarget->hasFP16()) 16164 return Op; 16165 16166 // Lib call from 32 -> 16 / 64 -> [32, 16] 16167 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 16168 assert(LC != RTLIB::UNKNOWN_LIBCALL && 16169 "Unexpected type for custom-lowering FP_ROUND"); 16170 MakeLibCallOptions CallOptions; 16171 return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first; 16172 } 16173 16174 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 16175 SelectionDAG &DAG) const { 16176 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 16177 MVT HalfT = MVT::i32; 16178 SDLoc dl(N); 16179 SDValue Hi, Lo, Tmp; 16180 16181 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 16182 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 16183 return ; 16184 16185 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 16186 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 16187 16188 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16189 DAG.getConstant(0, dl, HalfT)); 16190 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 16191 DAG.getConstant(1, dl, HalfT)); 16192 16193 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 16194 DAG.getConstant(OpTypeBits - 1, dl, 16195 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 16196 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 16197 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 16198 SDValue(Lo.getNode(), 1)); 16199 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 16200 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 16201 16202 Results.push_back(Lo); 16203 Results.push_back(Hi); 16204 } 16205 16206 bool 16207 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 16208 // The ARM target isn't yet aware of offsets. 
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);
  SDValue Hi, Lo, Tmp;

  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
    return;

  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(0, dl, HalfT));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(1, dl, HalfT));

  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
                    DAG.getConstant(OpTypeBits - 1, dl,
                                    getShiftAmountTy(HalfT, DAG.getDataLayout())));
  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
                   SDValue(Lo.getNode(), 1));
  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);

  Results.push_back(Lo);
  Results.push_back(Hi);
}

bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}
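
// For illustration: 0xFF0000FF is an inverted bit-field mask (~v == 0x00FFFF00
// is a single contiguous run of set bits), while 0xFF00FF00 is not
// (~v == 0x00FF00FF has two separate runs), and 0xFFFFFFFF is explicitly
// rejected below.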
bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // There can be 1's on either or both "outsides"; all the "inside" bits
  // must be 0's.
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
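/// For example (illustrative), a vld2 of two <4 x i32> vectors returns
/// { <4 x i32>, <4 x i32> } (256 bits in total), so the conservative memVT
/// chosen below is v4i64; only the overall size of the access matters here.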
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
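
// Illustrative note: the six operands passed to llvm.arm.mcr below select
// coprocessor p15, opc1 #0, a zero source value, CRn c7, CRm c10, opc2 #5,
// i.e. roughly "mcr p15, #0, rX, c7, c10, #5" -- the ARMv6 CP15 data memory
// barrier operation, used as a DMB substitute on cores without the DMB
// instruction.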
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, it is
  // better to leave it at float, as we have more freedom in the addressing
  // mode for those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize();
}
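
/// Emit a load-linked for Addr: an ldrex/ldaex of the appropriate width, or,
/// for 64-bit values (as the comment below explains), an ldrexd/ldaexd whose
/// {lo, hi} result pair is reassembled into a single i64.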
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}
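
// For illustration: <8 x i8>, <4 x i16> and <2 x i32> (64 bits) and
// <16 x i8>, <8 x i16> and <4 x i32> (128 bits) are accepted below, while
// f16 vectors, single-element vectors and 64-bit-element vectors are not.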
bool ARMTargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (VecTy->getElementType()->isHalfTy())
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Type *Tys[] = {VecTy, Int8Ptr};
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    SmallVector<Value *, 2> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));

    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Type *Tys[] = {Int8Ptr, SubVecTy};
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 6> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

    Function *VstNFunc =
        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0.
        // Note: StartMask cannot be negative; it's checked in
        // isReInterleaveMask.
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    Ops.push_back(Builder.getInt32(SI->getAlignment()));
    Builder.CreateCall(VstNFunc, Ops);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};
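
// For illustration (per the AAPCS-VFP rules encoded below): a struct of three
// floats is a homogeneous aggregate with Base HA_FLOAT and 3 members; a struct
// mixing float and double members is not homogeneous; and an aggregate with
// more than four members is rejected by the final check.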
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
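/// For illustration (assuming the usual AAPCS stack alignment of 8 bytes), a
/// <4 x i32> argument has 16-byte ABI type alignment but is clamped to 8 here
/// so that passing it never forces stack realignment.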
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                       DataLayout DL) const {
  const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}