1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Custom DAG lowering for R600 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "R600ISelLowering.h" 15 #include "AMDGPU.h" 16 #include "MCTargetDesc/R600MCTargetDesc.h" 17 #include "R600Defines.h" 18 #include "R600InstrInfo.h" 19 #include "R600MachineFunctionInfo.h" 20 #include "R600Subtarget.h" 21 #include "llvm/CodeGen/MachineFunction.h" 22 #include "llvm/IR/IntrinsicsAMDGPU.h" 23 #include "llvm/IR/IntrinsicsR600.h" 24 25 using namespace llvm; 26 27 #include "R600GenCallingConv.inc" 28 29 R600TargetLowering::R600TargetLowering(const TargetMachine &TM, 30 const R600Subtarget &STI) 31 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { 32 addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); 33 addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); 34 addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); 35 addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); 36 addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); 37 addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); 38 39 setBooleanContents(ZeroOrNegativeOneBooleanContent); 40 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 41 42 computeRegisterProperties(Subtarget->getRegisterInfo()); 43 44 // Legalize loads and stores to the private address space. 45 setOperationAction(ISD::LOAD, MVT::i32, Custom); 46 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 47 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 48 49 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address 50 // spaces, so it is custom lowered to handle those where it isn't. 51 for (MVT VT : MVT::integer_valuetypes()) { 52 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 53 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); 54 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); 55 56 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 57 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); 58 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); 59 60 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 61 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); 62 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); 63 } 64 65 // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 
66 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 67 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 68 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); 69 70 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 71 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 72 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); 73 74 setOperationAction(ISD::STORE, MVT::i8, Custom); 75 setOperationAction(ISD::STORE, MVT::i32, Custom); 76 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 77 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 78 79 setTruncStoreAction(MVT::i32, MVT::i8, Custom); 80 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 81 // We need to include these since trunc STORES to PRIVATE need 82 // special handling to accommodate RMW 83 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 84 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); 85 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); 86 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); 87 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom); 88 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 89 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); 90 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom); 91 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom); 92 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom); 93 94 // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. 95 setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); 96 setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); 97 98 // Set condition code actions 99 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 100 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 101 setCondCodeAction(ISD::SETLT, MVT::f32, Expand); 102 setCondCodeAction(ISD::SETLE, MVT::f32, Expand); 103 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); 104 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 105 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 106 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 107 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 108 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 109 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 110 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 111 112 setCondCodeAction(ISD::SETLE, MVT::i32, Expand); 113 setCondCodeAction(ISD::SETLT, MVT::i32, Expand); 114 setCondCodeAction(ISD::SETULE, MVT::i32, Expand); 115 setCondCodeAction(ISD::SETULT, MVT::i32, Expand); 116 117 setOperationAction(ISD::FCOS, MVT::f32, Custom); 118 setOperationAction(ISD::FSIN, MVT::f32, Custom); 119 120 setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 121 setOperationAction(ISD::SETCC, MVT::v2i32, Expand); 122 123 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 124 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 125 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 126 127 setOperationAction(ISD::FSUB, MVT::f32, Expand); 128 129 setOperationAction(ISD::FCEIL, MVT::f64, Custom); 130 setOperationAction(ISD::FTRUNC, MVT::f64, Custom); 131 setOperationAction(ISD::FRINT, MVT::f64, Custom); 132 setOperationAction(ISD::FFLOOR, MVT::f64, Custom); 133 134 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 135 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 136 137 setOperationAction(ISD::SETCC, MVT::i32, Expand); 138 setOperationAction(ISD::SETCC, MVT::f32, Expand); 139 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 140 setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); 141 setOperationAction(ISD::FP_TO_SINT, 
MVT::i64, Custom); 142 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 143 144 setOperationAction(ISD::SELECT, MVT::i32, Expand); 145 setOperationAction(ISD::SELECT, MVT::f32, Expand); 146 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 147 setOperationAction(ISD::SELECT, MVT::v4i32, Expand); 148 149 // ADD, SUB overflow. 150 // TODO: turn these into Legal? 151 if (Subtarget->hasCARRY()) 152 setOperationAction(ISD::UADDO, MVT::i32, Custom); 153 154 if (Subtarget->hasBORROW()) 155 setOperationAction(ISD::USUBO, MVT::i32, Custom); 156 157 // Expand sign extension of vectors 158 if (!Subtarget->hasBFE()) 159 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 160 161 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); 162 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); 163 164 if (!Subtarget->hasBFE()) 165 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 166 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); 167 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); 168 169 if (!Subtarget->hasBFE()) 170 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 171 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 172 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); 173 174 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 175 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); 176 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); 177 178 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); 179 180 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 181 182 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); 183 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); 184 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 185 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 186 187 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); 188 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); 189 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 190 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 191 192 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 193 // to be Legal/Custom in order to avoid library calls. 194 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 195 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 196 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 197 198 if (!Subtarget->hasFMA()) { 199 setOperationAction(ISD::FMA, MVT::f32, Expand); 200 setOperationAction(ISD::FMA, MVT::f64, Expand); 201 } 202 203 // FIXME: May need no denormals check 204 setOperationAction(ISD::FMAD, MVT::f32, Legal); 205 206 if (!Subtarget->hasBFI()) { 207 // fcopysign can be done in a single instruction with BFI. 208 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 209 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 210 } 211 212 if (!Subtarget->hasBCNT(32)) 213 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 214 215 if (!Subtarget->hasBCNT(64)) 216 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 217 218 if (Subtarget->hasFFBH()) 219 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); 220 221 if (Subtarget->hasFFBL()) 222 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); 223 224 // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we 225 // need it for R600. 
226 if (Subtarget->hasBFE()) 227 setHasExtractBitsInsn(true); 228 229 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 230 231 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 232 for (MVT VT : ScalarIntVTs) { 233 setOperationAction(ISD::ADDC, VT, Expand); 234 setOperationAction(ISD::SUBC, VT, Expand); 235 setOperationAction(ISD::ADDE, VT, Expand); 236 setOperationAction(ISD::SUBE, VT, Expand); 237 } 238 239 // LLVM will expand these to atomic_cmp_swap(0) 240 // and atomic_swap, respectively. 241 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); 242 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); 243 244 // We need to custom lower some of the intrinsics 245 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 246 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 247 248 setSchedulingPreference(Sched::Source); 249 250 setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT, 251 ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD}); 252 } 253 254 static inline bool isEOP(MachineBasicBlock::iterator I) { 255 if (std::next(I) == I->getParent()->end()) 256 return false; 257 return std::next(I)->getOpcode() == R600::RETURN; 258 } 259 260 MachineBasicBlock * 261 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 262 MachineBasicBlock *BB) const { 263 MachineFunction *MF = BB->getParent(); 264 MachineRegisterInfo &MRI = MF->getRegInfo(); 265 MachineBasicBlock::iterator I = MI; 266 const R600InstrInfo *TII = Subtarget->getInstrInfo(); 267 268 switch (MI.getOpcode()) { 269 default: 270 // Replace LDS_*_RET instruction that don't have any uses with the 271 // equivalent LDS_*_NORET instruction. 272 if (TII->isLDSRetInstr(MI.getOpcode())) { 273 int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); 274 assert(DstIdx != -1); 275 MachineInstrBuilder NewMI; 276 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add 277 // LDS_1A2D support and remove this special case. 
278 if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || 279 MI.getOpcode() == R600::LDS_CMPST_RET) 280 return BB; 281 282 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), 283 TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); 284 for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) 285 NewMI.add(MO); 286 } else { 287 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 288 } 289 break; 290 291 case R600::FABS_R600: { 292 MachineInstr *NewMI = TII->buildDefaultInstruction( 293 *BB, I, R600::MOV, MI.getOperand(0).getReg(), 294 MI.getOperand(1).getReg()); 295 TII->addFlag(*NewMI, 0, MO_FLAG_ABS); 296 break; 297 } 298 299 case R600::FNEG_R600: { 300 MachineInstr *NewMI = TII->buildDefaultInstruction( 301 *BB, I, R600::MOV, MI.getOperand(0).getReg(), 302 MI.getOperand(1).getReg()); 303 TII->addFlag(*NewMI, 0, MO_FLAG_NEG); 304 break; 305 } 306 307 case R600::MASK_WRITE: { 308 Register maskedRegister = MI.getOperand(0).getReg(); 309 assert(maskedRegister.isVirtual()); 310 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 311 TII->addFlag(*defInstr, 0, MO_FLAG_MASK); 312 break; 313 } 314 315 case R600::MOV_IMM_F32: 316 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) 317 .getFPImm() 318 ->getValueAPF() 319 .bitcastToAPInt() 320 .getZExtValue()); 321 break; 322 323 case R600::MOV_IMM_I32: 324 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), 325 MI.getOperand(1).getImm()); 326 break; 327 328 case R600::MOV_IMM_GLOBAL_ADDR: { 329 //TODO: Perhaps combine this instruction with the next if possible 330 auto MIB = TII->buildDefaultInstruction( 331 *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); 332 int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); 333 //TODO: Ugh this is rather ugly 334 const MachineOperand &MO = MI.getOperand(1); 335 MIB->getOperand(Idx).ChangeToGA(MO.getGlobal(), MO.getOffset(), 336 MO.getTargetFlags()); 337 break; 338 } 339 340 case R600::CONST_COPY: { 341 MachineInstr *NewMI = TII->buildDefaultInstruction( 342 *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); 343 TII->setImmOperand(*NewMI, R600::OpName::src0_sel, 344 MI.getOperand(1).getImm()); 345 break; 346 } 347 348 case R600::RAT_WRITE_CACHELESS_32_eg: 349 case R600::RAT_WRITE_CACHELESS_64_eg: 350 case R600::RAT_WRITE_CACHELESS_128_eg: 351 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 352 .add(MI.getOperand(0)) 353 .add(MI.getOperand(1)) 354 .addImm(isEOP(I)); // Set End of program bit 355 break; 356 357 case R600::RAT_STORE_TYPED_eg: 358 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 359 .add(MI.getOperand(0)) 360 .add(MI.getOperand(1)) 361 .add(MI.getOperand(2)) 362 .addImm(isEOP(I)); // Set End of program bit 363 break; 364 365 case R600::BRANCH: 366 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) 367 .add(MI.getOperand(0)); 368 break; 369 370 case R600::BRANCH_COND_f32: { 371 MachineInstr *NewMI = 372 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), 373 R600::PREDICATE_BIT) 374 .add(MI.getOperand(1)) 375 .addImm(R600::PRED_SETNE) 376 .addImm(0); // Flags 377 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); 378 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) 379 .add(MI.getOperand(0)) 380 .addReg(R600::PREDICATE_BIT, RegState::Kill); 381 break; 382 } 383 384 case R600::BRANCH_COND_i32: { 385 MachineInstr *NewMI = 386 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), 387 R600::PREDICATE_BIT) 388 .add(MI.getOperand(1)) 389 .addImm(R600::PRED_SETNE_INT) 390 
.addImm(0); // Flags 391 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); 392 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) 393 .add(MI.getOperand(0)) 394 .addReg(R600::PREDICATE_BIT, RegState::Kill); 395 break; 396 } 397 398 case R600::EG_ExportSwz: 399 case R600::R600_ExportSwz: { 400 // Instruction is left unmodified if its not the last one of its type 401 bool isLastInstructionOfItsType = true; 402 unsigned InstExportType = MI.getOperand(1).getImm(); 403 for (MachineBasicBlock::iterator NextExportInst = std::next(I), 404 EndBlock = BB->end(); NextExportInst != EndBlock; 405 NextExportInst = std::next(NextExportInst)) { 406 if (NextExportInst->getOpcode() == R600::EG_ExportSwz || 407 NextExportInst->getOpcode() == R600::R600_ExportSwz) { 408 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 409 .getImm(); 410 if (CurrentInstExportType == InstExportType) { 411 isLastInstructionOfItsType = false; 412 break; 413 } 414 } 415 } 416 bool EOP = isEOP(I); 417 if (!EOP && !isLastInstructionOfItsType) 418 return BB; 419 unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40; 420 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) 421 .add(MI.getOperand(0)) 422 .add(MI.getOperand(1)) 423 .add(MI.getOperand(2)) 424 .add(MI.getOperand(3)) 425 .add(MI.getOperand(4)) 426 .add(MI.getOperand(5)) 427 .add(MI.getOperand(6)) 428 .addImm(CfInst) 429 .addImm(EOP); 430 break; 431 } 432 case R600::RETURN: { 433 return BB; 434 } 435 } 436 437 MI.eraseFromParent(); 438 return BB; 439 } 440 441 //===----------------------------------------------------------------------===// 442 // Custom DAG Lowering Operations 443 //===----------------------------------------------------------------------===// 444 445 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 446 MachineFunction &MF = DAG.getMachineFunction(); 447 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 448 switch (Op.getOpcode()) { 449 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 450 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 451 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 452 case ISD::SHL_PARTS: 453 case ISD::SRA_PARTS: 454 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 455 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); 456 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); 457 case ISD::FCOS: 458 case ISD::FSIN: return LowerTrig(Op, DAG); 459 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 460 case ISD::STORE: return LowerSTORE(Op, DAG); 461 case ISD::LOAD: { 462 SDValue Result = LowerLOAD(Op, DAG); 463 assert((!Result.getNode() || 464 Result.getNode()->getNumValues() == 2) && 465 "Load should return a value and a chain"); 466 return Result; 467 } 468 469 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 470 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 471 case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); 472 case ISD::INTRINSIC_VOID: { 473 SDValue Chain = Op.getOperand(0); 474 unsigned IntrinsicID = 475 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 476 switch (IntrinsicID) { 477 case Intrinsic::r600_store_swizzle: { 478 SDLoc DL(Op); 479 const SDValue Args[8] = { 480 Chain, 481 Op.getOperand(2), // Export Value 482 Op.getOperand(3), // ArrayBase 483 Op.getOperand(4), // Type 484 DAG.getConstant(0, DL, MVT::i32), // SWZ_X 485 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y 486 DAG.getConstant(2, DL, 
MVT::i32), // SWZ_Z 487 DAG.getConstant(3, DL, MVT::i32) // SWZ_W 488 }; 489 return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args); 490 } 491 492 // default for switch(IntrinsicID) 493 default: break; 494 } 495 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 496 break; 497 } 498 case ISD::INTRINSIC_WO_CHAIN: { 499 unsigned IntrinsicID = 500 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 501 EVT VT = Op.getValueType(); 502 SDLoc DL(Op); 503 switch (IntrinsicID) { 504 case Intrinsic::r600_tex: 505 case Intrinsic::r600_texc: { 506 unsigned TextureOp; 507 switch (IntrinsicID) { 508 case Intrinsic::r600_tex: 509 TextureOp = 0; 510 break; 511 case Intrinsic::r600_texc: 512 TextureOp = 1; 513 break; 514 default: 515 llvm_unreachable("unhandled texture operation"); 516 } 517 518 SDValue TexArgs[19] = { 519 DAG.getConstant(TextureOp, DL, MVT::i32), 520 Op.getOperand(1), 521 DAG.getConstant(0, DL, MVT::i32), 522 DAG.getConstant(1, DL, MVT::i32), 523 DAG.getConstant(2, DL, MVT::i32), 524 DAG.getConstant(3, DL, MVT::i32), 525 Op.getOperand(2), 526 Op.getOperand(3), 527 Op.getOperand(4), 528 DAG.getConstant(0, DL, MVT::i32), 529 DAG.getConstant(1, DL, MVT::i32), 530 DAG.getConstant(2, DL, MVT::i32), 531 DAG.getConstant(3, DL, MVT::i32), 532 Op.getOperand(5), 533 Op.getOperand(6), 534 Op.getOperand(7), 535 Op.getOperand(8), 536 Op.getOperand(9), 537 Op.getOperand(10) 538 }; 539 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); 540 } 541 case Intrinsic::r600_dot4: { 542 SDValue Args[8] = { 543 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 544 DAG.getConstant(0, DL, MVT::i32)), 545 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 546 DAG.getConstant(0, DL, MVT::i32)), 547 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 548 DAG.getConstant(1, DL, MVT::i32)), 549 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 550 DAG.getConstant(1, DL, MVT::i32)), 551 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 552 DAG.getConstant(2, DL, MVT::i32)), 553 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 554 DAG.getConstant(2, DL, MVT::i32)), 555 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 556 DAG.getConstant(3, DL, MVT::i32)), 557 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), 558 DAG.getConstant(3, DL, MVT::i32)) 559 }; 560 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); 561 } 562 563 case Intrinsic::r600_implicitarg_ptr: { 564 MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); 565 uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); 566 return DAG.getConstant(ByteOffset, DL, PtrVT); 567 } 568 case Intrinsic::r600_read_ngroups_x: 569 return LowerImplicitParameter(DAG, VT, DL, 0); 570 case Intrinsic::r600_read_ngroups_y: 571 return LowerImplicitParameter(DAG, VT, DL, 1); 572 case Intrinsic::r600_read_ngroups_z: 573 return LowerImplicitParameter(DAG, VT, DL, 2); 574 case Intrinsic::r600_read_global_size_x: 575 return LowerImplicitParameter(DAG, VT, DL, 3); 576 case Intrinsic::r600_read_global_size_y: 577 return LowerImplicitParameter(DAG, VT, DL, 4); 578 case Intrinsic::r600_read_global_size_z: 579 return LowerImplicitParameter(DAG, VT, DL, 5); 580 case Intrinsic::r600_read_local_size_x: 581 return LowerImplicitParameter(DAG, VT, DL, 6); 582 case Intrinsic::r600_read_local_size_y: 583 return LowerImplicitParameter(DAG, VT, DL, 7); 584 
case Intrinsic::r600_read_local_size_z: 585 return LowerImplicitParameter(DAG, VT, DL, 8); 586 587 case Intrinsic::r600_read_tgid_x: 588 case Intrinsic::amdgcn_workgroup_id_x: 589 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 590 R600::T1_X, VT); 591 case Intrinsic::r600_read_tgid_y: 592 case Intrinsic::amdgcn_workgroup_id_y: 593 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 594 R600::T1_Y, VT); 595 case Intrinsic::r600_read_tgid_z: 596 case Intrinsic::amdgcn_workgroup_id_z: 597 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 598 R600::T1_Z, VT); 599 case Intrinsic::r600_read_tidig_x: 600 case Intrinsic::amdgcn_workitem_id_x: 601 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 602 R600::T0_X, VT); 603 case Intrinsic::r600_read_tidig_y: 604 case Intrinsic::amdgcn_workitem_id_y: 605 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 606 R600::T0_Y, VT); 607 case Intrinsic::r600_read_tidig_z: 608 case Intrinsic::amdgcn_workitem_id_z: 609 return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, 610 R600::T0_Z, VT); 611 612 case Intrinsic::r600_recipsqrt_ieee: 613 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 614 615 case Intrinsic::r600_recipsqrt_clamped: 616 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 617 default: 618 return Op; 619 } 620 621 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 622 break; 623 } 624 } // end switch(Op.getOpcode()) 625 return SDValue(); 626 } 627 628 void R600TargetLowering::ReplaceNodeResults(SDNode *N, 629 SmallVectorImpl<SDValue> &Results, 630 SelectionDAG &DAG) const { 631 switch (N->getOpcode()) { 632 default: 633 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 634 return; 635 case ISD::FP_TO_UINT: 636 if (N->getValueType(0) == MVT::i1) { 637 Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG)); 638 return; 639 } 640 // Since we don't care about out of bounds values we can use FP_TO_SINT for 641 // uints too. The DAGLegalizer code for uint considers some extra cases 642 // which are not necessary here. 
643 LLVM_FALLTHROUGH; 644 case ISD::FP_TO_SINT: { 645 if (N->getValueType(0) == MVT::i1) { 646 Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG)); 647 return; 648 } 649 650 SDValue Result; 651 if (expandFP_TO_SINT(N, Result, DAG)) 652 Results.push_back(Result); 653 return; 654 } 655 case ISD::SDIVREM: { 656 SDValue Op = SDValue(N, 1); 657 SDValue RES = LowerSDIVREM(Op, DAG); 658 Results.push_back(RES); 659 Results.push_back(RES.getValue(1)); 660 break; 661 } 662 case ISD::UDIVREM: { 663 SDValue Op = SDValue(N, 0); 664 LowerUDIVREM64(Op, DAG, Results); 665 break; 666 } 667 } 668 } 669 670 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, 671 SDValue Vector) const { 672 SDLoc DL(Vector); 673 EVT VecVT = Vector.getValueType(); 674 EVT EltVT = VecVT.getVectorElementType(); 675 SmallVector<SDValue, 8> Args; 676 677 for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { 678 Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, 679 DAG.getVectorIdxConstant(i, DL))); 680 } 681 682 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); 683 } 684 685 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 686 SelectionDAG &DAG) const { 687 SDLoc DL(Op); 688 SDValue Vector = Op.getOperand(0); 689 SDValue Index = Op.getOperand(1); 690 691 if (isa<ConstantSDNode>(Index) || 692 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 693 return Op; 694 695 Vector = vectorToVerticalVector(DAG, Vector); 696 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), 697 Vector, Index); 698 } 699 700 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 701 SelectionDAG &DAG) const { 702 SDLoc DL(Op); 703 SDValue Vector = Op.getOperand(0); 704 SDValue Value = Op.getOperand(1); 705 SDValue Index = Op.getOperand(2); 706 707 if (isa<ConstantSDNode>(Index) || 708 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) 709 return Op; 710 711 Vector = vectorToVerticalVector(DAG, Vector); 712 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), 713 Vector, Value, Index); 714 return vectorToVerticalVector(DAG, Insert); 715 } 716 717 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 718 SDValue Op, 719 SelectionDAG &DAG) const { 720 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 721 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 722 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 723 724 const DataLayout &DL = DAG.getDataLayout(); 725 const GlobalValue *GV = GSD->getGlobal(); 726 MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 727 728 SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); 729 return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); 730 } 731 732 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 733 // On hw >= R700, COS/SIN input must be between -1. and 1. 734 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) 735 EVT VT = Op.getValueType(); 736 SDValue Arg = Op.getOperand(0); 737 SDLoc DL(Op); 738 739 // TODO: Should this propagate fast-math-flags? 
740 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 741 DAG.getNode(ISD::FADD, DL, VT, 742 DAG.getNode(ISD::FMUL, DL, VT, Arg, 743 DAG.getConstantFP(0.15915494309, DL, MVT::f32)), 744 DAG.getConstantFP(0.5, DL, MVT::f32))); 745 unsigned TrigNode; 746 switch (Op.getOpcode()) { 747 case ISD::FCOS: 748 TrigNode = AMDGPUISD::COS_HW; 749 break; 750 case ISD::FSIN: 751 TrigNode = AMDGPUISD::SIN_HW; 752 break; 753 default: 754 llvm_unreachable("Wrong trig opcode"); 755 } 756 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, 757 DAG.getNode(ISD::FADD, DL, VT, FractPart, 758 DAG.getConstantFP(-0.5, DL, MVT::f32))); 759 if (Gen >= AMDGPUSubtarget::R700) 760 return TrigVal; 761 // On R600 hw, COS/SIN input must be between -Pi and Pi. 762 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, 763 DAG.getConstantFP(numbers::pif, DL, MVT::f32)); 764 } 765 766 SDValue R600TargetLowering::LowerShiftParts(SDValue Op, 767 SelectionDAG &DAG) const { 768 SDValue Lo, Hi; 769 expandShiftParts(Op.getNode(), Lo, Hi, DAG); 770 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); 771 } 772 773 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, 774 unsigned mainop, unsigned ovf) const { 775 SDLoc DL(Op); 776 EVT VT = Op.getValueType(); 777 778 SDValue Lo = Op.getOperand(0); 779 SDValue Hi = Op.getOperand(1); 780 781 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); 782 // Extend sign. 783 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, 784 DAG.getValueType(MVT::i1)); 785 786 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); 787 788 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); 789 } 790 791 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { 792 SDLoc DL(Op); 793 return DAG.getNode( 794 ISD::SETCC, 795 DL, 796 MVT::i1, 797 Op, DAG.getConstantFP(1.0f, DL, MVT::f32), 798 DAG.getCondCode(ISD::SETEQ)); 799 } 800 801 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { 802 SDLoc DL(Op); 803 return DAG.getNode( 804 ISD::SETCC, 805 DL, 806 MVT::i1, 807 Op, DAG.getConstantFP(-1.0f, DL, MVT::f32), 808 DAG.getCondCode(ISD::SETEQ)); 809 } 810 811 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 812 const SDLoc &DL, 813 unsigned DwordOffset) const { 814 unsigned ByteOffset = DwordOffset * 4; 815 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 816 AMDGPUAS::PARAM_I_ADDRESS); 817 818 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
819 assert(isInt<16>(ByteOffset)); 820 821 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 822 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR 823 MachinePointerInfo(ConstantPointerNull::get(PtrType))); 824 } 825 826 bool R600TargetLowering::isZero(SDValue Op) const { 827 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 828 return Cst->isZero(); 829 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 830 return CstFP->isZero(); 831 } else { 832 return false; 833 } 834 } 835 836 bool R600TargetLowering::isHWTrueValue(SDValue Op) const { 837 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 838 return CFP->isExactlyValue(1.0); 839 } 840 return isAllOnesConstant(Op); 841 } 842 843 bool R600TargetLowering::isHWFalseValue(SDValue Op) const { 844 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 845 return CFP->getValueAPF().isZero(); 846 } 847 return isNullConstant(Op); 848 } 849 850 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 851 SDLoc DL(Op); 852 EVT VT = Op.getValueType(); 853 854 SDValue LHS = Op.getOperand(0); 855 SDValue RHS = Op.getOperand(1); 856 SDValue True = Op.getOperand(2); 857 SDValue False = Op.getOperand(3); 858 SDValue CC = Op.getOperand(4); 859 SDValue Temp; 860 861 if (VT == MVT::f32) { 862 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 863 SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); 864 if (MinMax) 865 return MinMax; 866 } 867 868 // LHS and RHS are guaranteed to be the same value type 869 EVT CompareVT = LHS.getValueType(); 870 871 // Check if we can lower this to a native operation. 872 873 // Try to lower to a SET* instruction: 874 // 875 // SET* can match the following patterns: 876 // 877 // select_cc f32, f32, -1, 0, cc_supported 878 // select_cc f32, f32, 1.0f, 0.0f, cc_supported 879 // select_cc i32, i32, -1, 0, cc_supported 880 // 881 882 // Move hardware True/False values to the correct operand. 883 if (isHWTrueValue(False) && isHWFalseValue(True)) { 884 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 885 ISD::CondCode InverseCC = ISD::getSetCCInverse(CCOpcode, CompareVT); 886 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { 887 std::swap(False, True); 888 CC = DAG.getCondCode(InverseCC); 889 } else { 890 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); 891 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { 892 std::swap(False, True); 893 std::swap(LHS, RHS); 894 CC = DAG.getCondCode(SwapInvCC); 895 } 896 } 897 } 898 899 if (isHWTrueValue(True) && isHWFalseValue(False) && 900 (CompareVT == VT || VT == MVT::i32)) { 901 // This can be matched by a SET* instruction. 
902 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); 903 } 904 905 // Try to lower to a CND* instruction: 906 // 907 // CND* can match the following patterns: 908 // 909 // select_cc f32, 0.0, f32, f32, cc_supported 910 // select_cc f32, 0.0, i32, i32, cc_supported 911 // select_cc i32, 0, f32, f32, cc_supported 912 // select_cc i32, 0, i32, i32, cc_supported 913 // 914 915 // Try to move the zero value to the RHS 916 if (isZero(LHS)) { 917 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 918 // Try swapping the operands 919 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); 920 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 921 std::swap(LHS, RHS); 922 CC = DAG.getCondCode(CCSwapped); 923 } else { 924 // Try inverting the condition and then swapping the operands 925 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT); 926 CCSwapped = ISD::getSetCCSwappedOperands(CCInv); 927 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { 928 std::swap(True, False); 929 std::swap(LHS, RHS); 930 CC = DAG.getCondCode(CCSwapped); 931 } 932 } 933 } 934 if (isZero(RHS)) { 935 SDValue Cond = LHS; 936 SDValue Zero = RHS; 937 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 938 if (CompareVT != VT) { 939 // Bitcast True / False to the correct types. This will end up being 940 // a nop, but it allows us to define only a single pattern in the 941 // .TD files for each CND* instruction rather than having to have 942 // one pattern for integer True/False and one for fp True/False 943 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); 944 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); 945 } 946 947 switch (CCOpcode) { 948 case ISD::SETONE: 949 case ISD::SETUNE: 950 case ISD::SETNE: 951 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT); 952 Temp = True; 953 True = False; 954 False = Temp; 955 break; 956 default: 957 break; 958 } 959 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, 960 Cond, Zero, 961 True, False, 962 DAG.getCondCode(CCOpcode)); 963 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); 964 } 965 966 // If we make it this for it means we have no native instructions to handle 967 // this SELECT_CC, so we must lower it. 968 SDValue HWTrue, HWFalse; 969 970 if (CompareVT == MVT::f32) { 971 HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); 972 HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); 973 } else if (CompareVT == MVT::i32) { 974 HWTrue = DAG.getConstant(-1, DL, CompareVT); 975 HWFalse = DAG.getConstant(0, DL, CompareVT); 976 } 977 else { 978 llvm_unreachable("Unhandled value type in LowerSELECT_CC"); 979 } 980 981 // Lower this unsupported SELECT_CC into a combination of two supported 982 // SELECT_CC operations. 983 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); 984 985 return DAG.getNode(ISD::SELECT_CC, DL, VT, 986 Cond, HWFalse, 987 True, False, 988 DAG.getCondCode(ISD::SETNE)); 989 } 990 991 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to 992 /// convert these pointers to a register index. Each register holds 993 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the 994 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used 995 /// for indirect addressing. 
996 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 997 unsigned StackWidth, 998 SelectionDAG &DAG) const { 999 unsigned SRLPad; 1000 switch(StackWidth) { 1001 case 1: 1002 SRLPad = 2; 1003 break; 1004 case 2: 1005 SRLPad = 3; 1006 break; 1007 case 4: 1008 SRLPad = 4; 1009 break; 1010 default: llvm_unreachable("Invalid stack width"); 1011 } 1012 1013 SDLoc DL(Ptr); 1014 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, 1015 DAG.getConstant(SRLPad, DL, MVT::i32)); 1016 } 1017 1018 void R600TargetLowering::getStackAddress(unsigned StackWidth, 1019 unsigned ElemIdx, 1020 unsigned &Channel, 1021 unsigned &PtrIncr) const { 1022 switch (StackWidth) { 1023 default: 1024 case 1: 1025 Channel = 0; 1026 if (ElemIdx > 0) { 1027 PtrIncr = 1; 1028 } else { 1029 PtrIncr = 0; 1030 } 1031 break; 1032 case 2: 1033 Channel = ElemIdx % 2; 1034 if (ElemIdx == 2) { 1035 PtrIncr = 1; 1036 } else { 1037 PtrIncr = 0; 1038 } 1039 break; 1040 case 4: 1041 Channel = ElemIdx; 1042 PtrIncr = 0; 1043 break; 1044 } 1045 } 1046 1047 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, 1048 SelectionDAG &DAG) const { 1049 SDLoc DL(Store); 1050 //TODO: Who creates the i8 stores? 1051 assert(Store->isTruncatingStore() 1052 || Store->getValue().getValueType() == MVT::i8); 1053 assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); 1054 1055 SDValue Mask; 1056 if (Store->getMemoryVT() == MVT::i8) { 1057 assert(Store->getAlignment() >= 1); 1058 Mask = DAG.getConstant(0xff, DL, MVT::i32); 1059 } else if (Store->getMemoryVT() == MVT::i16) { 1060 assert(Store->getAlignment() >= 2); 1061 Mask = DAG.getConstant(0xffff, DL, MVT::i32); 1062 } else { 1063 llvm_unreachable("Unsupported private trunc store"); 1064 } 1065 1066 SDValue OldChain = Store->getChain(); 1067 bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN); 1068 // Skip dummy 1069 SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain; 1070 SDValue BasePtr = Store->getBasePtr(); 1071 SDValue Offset = Store->getOffset(); 1072 EVT MemVT = Store->getMemoryVT(); 1073 1074 SDValue LoadPtr = BasePtr; 1075 if (!Offset.isUndef()) { 1076 LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); 1077 } 1078 1079 // Get dword location 1080 // TODO: this should be eliminated by the future SHR ptr, 2 1081 SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1082 DAG.getConstant(0xfffffffc, DL, MVT::i32)); 1083 1084 // Load dword 1085 // TODO: can we be smarter about machine pointer info? 
1086 MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); 1087 SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); 1088 1089 Chain = Dst.getValue(1); 1090 1091 // Get offset in dword 1092 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1093 DAG.getConstant(0x3, DL, MVT::i32)); 1094 1095 // Convert byte offset to bit shift 1096 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1097 DAG.getConstant(3, DL, MVT::i32)); 1098 1099 // TODO: Contrary to the name of the function, 1100 // it also handles sub i32 non-truncating stores (like i1) 1101 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 1102 Store->getValue()); 1103 1104 // Mask the value to the right type 1105 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 1106 1107 // Shift the value in place 1108 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 1109 MaskedValue, ShiftAmt); 1110 1111 // Shift the mask in place 1112 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt); 1113 1114 // Invert the mask. NOTE: if we had native ROL instructions we could 1115 // use inverted mask 1116 DstMask = DAG.getNOT(DL, DstMask, MVT::i32); 1117 1118 // Cleanup the target bits 1119 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 1120 1121 // Add the new bits 1122 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 1123 1124 // Store dword 1125 // TODO: Can we be smarter about MachinePointerInfo? 1126 SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo); 1127 1128 // If we are part of expanded vector, make our neighbors depend on this store 1129 if (VectorTrunc) { 1130 // Make all other vector elements depend on this store 1131 Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore); 1132 DAG.ReplaceAllUsesOfValueWith(OldChain, Chain); 1133 } 1134 return NewStore; 1135 } 1136 1137 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1138 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 1139 unsigned AS = StoreNode->getAddressSpace(); 1140 1141 SDValue Chain = StoreNode->getChain(); 1142 SDValue Ptr = StoreNode->getBasePtr(); 1143 SDValue Value = StoreNode->getValue(); 1144 1145 EVT VT = Value.getValueType(); 1146 EVT MemVT = StoreNode->getMemoryVT(); 1147 EVT PtrVT = Ptr.getValueType(); 1148 1149 SDLoc DL(Op); 1150 1151 const bool TruncatingStore = StoreNode->isTruncatingStore(); 1152 1153 // Neither LOCAL nor PRIVATE can do vectors at the moment 1154 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS || 1155 TruncatingStore) && 1156 VT.isVector()) { 1157 if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { 1158 // Add an extra level of chain to isolate this vector 1159 SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); 1160 // TODO: can the chain be replaced without creating a new store? 
1161 SDValue NewStore = DAG.getTruncStore( 1162 NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), 1163 MemVT, StoreNode->getAlignment(), 1164 StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); 1165 StoreNode = cast<StoreSDNode>(NewStore); 1166 } 1167 1168 return scalarizeVectorStore(StoreNode, DAG); 1169 } 1170 1171 Align Alignment = StoreNode->getAlign(); 1172 if (Alignment < MemVT.getStoreSize() && 1173 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, 1174 StoreNode->getMemOperand()->getFlags(), 1175 nullptr)) { 1176 return expandUnalignedStore(StoreNode, DAG); 1177 } 1178 1179 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, 1180 DAG.getConstant(2, DL, PtrVT)); 1181 1182 if (AS == AMDGPUAS::GLOBAL_ADDRESS) { 1183 // It is beneficial to create MSKOR here instead of combiner to avoid 1184 // artificial dependencies introduced by RMW 1185 if (TruncatingStore) { 1186 assert(VT.bitsLE(MVT::i32)); 1187 SDValue MaskConstant; 1188 if (MemVT == MVT::i8) { 1189 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); 1190 } else { 1191 assert(MemVT == MVT::i16); 1192 assert(StoreNode->getAlignment() >= 2); 1193 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); 1194 } 1195 1196 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, 1197 DAG.getConstant(0x00000003, DL, PtrVT)); 1198 SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, 1199 DAG.getConstant(3, DL, VT)); 1200 1201 // Put the mask in correct place 1202 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); 1203 1204 // Put the value bits in correct place 1205 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); 1206 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); 1207 1208 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 1209 // vector instead. 1210 SDValue Src[4] = { 1211 ShiftedValue, 1212 DAG.getConstant(0, DL, MVT::i32), 1213 DAG.getConstant(0, DL, MVT::i32), 1214 Mask 1215 }; 1216 SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); 1217 SDValue Args[3] = { Chain, Input, DWordAddr }; 1218 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, 1219 Op->getVTList(), Args, MemVT, 1220 StoreNode->getMemOperand()); 1221 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { 1222 // Convert pointer from byte address to dword address. 
1223 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); 1224 1225 if (StoreNode->isIndexed()) { 1226 llvm_unreachable("Indexed stores not supported yet"); 1227 } else { 1228 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1229 } 1230 return Chain; 1231 } 1232 } 1233 1234 // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes 1235 if (AS != AMDGPUAS::PRIVATE_ADDRESS) 1236 return SDValue(); 1237 1238 if (MemVT.bitsLT(MVT::i32)) 1239 return lowerPrivateTruncStore(StoreNode, DAG); 1240 1241 // Standard i32+ store, tag it with DWORDADDR to note that the address 1242 // has been shifted 1243 if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { 1244 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); 1245 return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 1246 } 1247 1248 // Tagged i32+ stores will be matched by patterns 1249 return SDValue(); 1250 } 1251 1252 // return (512 + (kc_bank << 12) 1253 static int 1254 ConstantAddressBlock(unsigned AddressSpace) { 1255 switch (AddressSpace) { 1256 case AMDGPUAS::CONSTANT_BUFFER_0: 1257 return 512; 1258 case AMDGPUAS::CONSTANT_BUFFER_1: 1259 return 512 + 4096; 1260 case AMDGPUAS::CONSTANT_BUFFER_2: 1261 return 512 + 4096 * 2; 1262 case AMDGPUAS::CONSTANT_BUFFER_3: 1263 return 512 + 4096 * 3; 1264 case AMDGPUAS::CONSTANT_BUFFER_4: 1265 return 512 + 4096 * 4; 1266 case AMDGPUAS::CONSTANT_BUFFER_5: 1267 return 512 + 4096 * 5; 1268 case AMDGPUAS::CONSTANT_BUFFER_6: 1269 return 512 + 4096 * 6; 1270 case AMDGPUAS::CONSTANT_BUFFER_7: 1271 return 512 + 4096 * 7; 1272 case AMDGPUAS::CONSTANT_BUFFER_8: 1273 return 512 + 4096 * 8; 1274 case AMDGPUAS::CONSTANT_BUFFER_9: 1275 return 512 + 4096 * 9; 1276 case AMDGPUAS::CONSTANT_BUFFER_10: 1277 return 512 + 4096 * 10; 1278 case AMDGPUAS::CONSTANT_BUFFER_11: 1279 return 512 + 4096 * 11; 1280 case AMDGPUAS::CONSTANT_BUFFER_12: 1281 return 512 + 4096 * 12; 1282 case AMDGPUAS::CONSTANT_BUFFER_13: 1283 return 512 + 4096 * 13; 1284 case AMDGPUAS::CONSTANT_BUFFER_14: 1285 return 512 + 4096 * 14; 1286 case AMDGPUAS::CONSTANT_BUFFER_15: 1287 return 512 + 4096 * 15; 1288 default: 1289 return -1; 1290 } 1291 } 1292 1293 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, 1294 SelectionDAG &DAG) const { 1295 SDLoc DL(Op); 1296 LoadSDNode *Load = cast<LoadSDNode>(Op); 1297 ISD::LoadExtType ExtType = Load->getExtensionType(); 1298 EVT MemVT = Load->getMemoryVT(); 1299 assert(Load->getAlignment() >= MemVT.getStoreSize()); 1300 1301 SDValue BasePtr = Load->getBasePtr(); 1302 SDValue Chain = Load->getChain(); 1303 SDValue Offset = Load->getOffset(); 1304 1305 SDValue LoadPtr = BasePtr; 1306 if (!Offset.isUndef()) { 1307 LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); 1308 } 1309 1310 // Get dword location 1311 // NOTE: this should be eliminated by the future SHR ptr, 2 1312 SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, 1313 DAG.getConstant(0xfffffffc, DL, MVT::i32)); 1314 1315 // Load dword 1316 // TODO: can we be smarter about machine pointer info? 1317 MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS); 1318 SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); 1319 1320 // Get offset within the register. 1321 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 1322 LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); 1323 1324 // Bit offset of target byte (byteIdx * 8). 1325 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 1326 DAG.getConstant(3, DL, MVT::i32)); 1327 1328 // Shift to the right. 
1329 SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); 1330 1331 // Eliminate the upper bits by setting them to ... 1332 EVT MemEltVT = MemVT.getScalarType(); 1333 1334 if (ExtType == ISD::SEXTLOAD) { // ... ones. 1335 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 1336 Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); 1337 } else { // ... or zeros. 1338 Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); 1339 } 1340 1341 SDValue Ops[] = { 1342 Ret, 1343 Read.getValue(1) // This should be our output chain 1344 }; 1345 1346 return DAG.getMergeValues(Ops, DL); 1347 } 1348 1349 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1350 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 1351 unsigned AS = LoadNode->getAddressSpace(); 1352 EVT MemVT = LoadNode->getMemoryVT(); 1353 ISD::LoadExtType ExtType = LoadNode->getExtensionType(); 1354 1355 if (AS == AMDGPUAS::PRIVATE_ADDRESS && 1356 ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { 1357 return lowerPrivateExtLoad(Op, DAG); 1358 } 1359 1360 SDLoc DL(Op); 1361 EVT VT = Op.getValueType(); 1362 SDValue Chain = LoadNode->getChain(); 1363 SDValue Ptr = LoadNode->getBasePtr(); 1364 1365 if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1366 LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 1367 VT.isVector()) { 1368 SDValue Ops[2]; 1369 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LoadNode, DAG); 1370 return DAG.getMergeValues(Ops, DL); 1371 } 1372 1373 // This is still used for explicit load from addrspace(8) 1374 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 1375 if (ConstantBlock > -1 && 1376 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || 1377 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { 1378 SDValue Result; 1379 if (isa<Constant>(LoadNode->getMemOperand()->getValue()) || 1380 isa<ConstantSDNode>(Ptr)) { 1381 return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); 1382 } else { 1383 //TODO: Does this even work? 1384 // non-constant ptr can't be folded, keeps it as a v4f32 load 1385 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, 1386 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, 1387 DAG.getConstant(4, DL, MVT::i32)), 1388 DAG.getConstant(LoadNode->getAddressSpace() - 1389 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) 1390 ); 1391 } 1392 1393 if (!VT.isVector()) { 1394 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, 1395 DAG.getConstant(0, DL, MVT::i32)); 1396 } 1397 1398 SDValue MergedValues[2] = { 1399 Result, 1400 Chain 1401 }; 1402 return DAG.getMergeValues(MergedValues, DL); 1403 } 1404 1405 // For most operations returning SDValue() will result in the node being 1406 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we 1407 // need to manually expand loads that may be legal in some address spaces and 1408 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for 1409 // compute shaders, since the data is sign extended when it is uploaded to the 1410 // buffer. However SEXT loads from other address spaces are not supported, so 1411 // we need to expand them here. 
1412 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { 1413 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); 1414 SDValue NewLoad = DAG.getExtLoad( 1415 ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, 1416 LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); 1417 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, 1418 DAG.getValueType(MemVT)); 1419 1420 SDValue MergedValues[2] = { Res, Chain }; 1421 return DAG.getMergeValues(MergedValues, DL); 1422 } 1423 1424 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1425 return SDValue(); 1426 } 1427 1428 // DWORDADDR ISD marks already shifted address 1429 if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { 1430 assert(VT == MVT::i32); 1431 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); 1432 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); 1433 return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); 1434 } 1435 return SDValue(); 1436 } 1437 1438 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1439 SDValue Chain = Op.getOperand(0); 1440 SDValue Cond = Op.getOperand(1); 1441 SDValue Jump = Op.getOperand(2); 1442 1443 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), 1444 Chain, Jump, Cond); 1445 } 1446 1447 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, 1448 SelectionDAG &DAG) const { 1449 MachineFunction &MF = DAG.getMachineFunction(); 1450 const R600FrameLowering *TFL = Subtarget->getFrameLowering(); 1451 1452 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 1453 1454 unsigned FrameIndex = FIN->getIndex(); 1455 Register IgnoredFrameReg; 1456 StackOffset Offset = 1457 TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); 1458 return DAG.getConstant(Offset.getFixed() * 4 * TFL->getStackWidth(MF), 1459 SDLoc(Op), Op.getValueType()); 1460 } 1461 1462 CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1463 bool IsVarArg) const { 1464 switch (CC) { 1465 case CallingConv::AMDGPU_KERNEL: 1466 case CallingConv::SPIR_KERNEL: 1467 case CallingConv::C: 1468 case CallingConv::Fast: 1469 case CallingConv::Cold: 1470 llvm_unreachable("kernels should not be handled here"); 1471 case CallingConv::AMDGPU_VS: 1472 case CallingConv::AMDGPU_GS: 1473 case CallingConv::AMDGPU_PS: 1474 case CallingConv::AMDGPU_CS: 1475 case CallingConv::AMDGPU_HS: 1476 case CallingConv::AMDGPU_ES: 1477 case CallingConv::AMDGPU_LS: 1478 return CC_R600; 1479 default: 1480 report_fatal_error("Unsupported calling convention."); 1481 } 1482 } 1483 1484 /// XXX Only kernel functions are supported, so we can assume for now that 1485 /// every function is a kernel function, but in the future we should use 1486 /// separate calling conventions for kernel and non-kernel functions. 
1487 SDValue R600TargetLowering::LowerFormalArguments( 1488 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 1489 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 1490 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 1491 SmallVector<CCValAssign, 16> ArgLocs; 1492 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1493 *DAG.getContext()); 1494 MachineFunction &MF = DAG.getMachineFunction(); 1495 SmallVector<ISD::InputArg, 8> LocalIns; 1496 1497 if (AMDGPU::isShader(CallConv)) { 1498 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 1499 } else { 1500 analyzeFormalArgumentsCompute(CCInfo, Ins); 1501 } 1502 1503 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 1504 CCValAssign &VA = ArgLocs[i]; 1505 const ISD::InputArg &In = Ins[i]; 1506 EVT VT = In.VT; 1507 EVT MemVT = VA.getLocVT(); 1508 if (!VT.isVector() && MemVT.isVector()) { 1509 // Get load source type if scalarized. 1510 MemVT = MemVT.getVectorElementType(); 1511 } 1512 1513 if (AMDGPU::isShader(CallConv)) { 1514 Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); 1515 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1516 InVals.push_back(Register); 1517 continue; 1518 } 1519 1520 // i64 isn't a legal type, so the register type used ends up as i32, which 1521 // isn't expected here. It attempts to create this sextload, but it ends up 1522 // being invalid. Somehow this seems to work with i64 arguments, but breaks 1523 // for <1 x i64>. 1524 1525 // The first 36 bytes of the input buffer contains information about 1526 // thread group and global sizes. 1527 ISD::LoadExtType Ext = ISD::NON_EXTLOAD; 1528 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { 1529 // FIXME: This should really check the extload type, but the handling of 1530 // extload vector parameters seems to be broken. 1531 1532 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 1533 Ext = ISD::SEXTLOAD; 1534 } 1535 1536 // Compute the offset from the value. 1537 // XXX - I think PartOffset should give you this, but it seems to give the 1538 // size of the register which isn't useful. 1539 1540 unsigned PartOffset = VA.getLocMemOffset(); 1541 unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); 1542 1543 MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS); 1544 SDValue Arg = DAG.getLoad( 1545 ISD::UNINDEXED, Ext, VT, DL, Chain, 1546 DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), 1547 PtrInfo, 1548 MemVT, Alignment, MachineMemOperand::MONonTemporal | 1549 MachineMemOperand::MODereferenceable | 1550 MachineMemOperand::MOInvariant); 1551 1552 InVals.push_back(Arg); 1553 } 1554 return Chain; 1555 } 1556 1557 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1558 EVT VT) const { 1559 if (!VT.isVector()) 1560 return MVT::i32; 1561 return VT.changeVectorElementTypeToInteger(); 1562 } 1563 1564 bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, 1565 const MachineFunction &MF) const { 1566 // Local and Private addresses do not handle vectors. 
Limit to i32 1567 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) { 1568 return (MemVT.getSizeInBits() <= 32); 1569 } 1570 return true; 1571 } 1572 1573 bool R600TargetLowering::allowsMisalignedMemoryAccesses( 1574 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1575 bool *IsFast) const { 1576 if (IsFast) 1577 *IsFast = false; 1578 1579 if (!VT.isSimple() || VT == MVT::Other) 1580 return false; 1581 1582 if (VT.bitsLT(MVT::i32)) 1583 return false; 1584 1585 // TODO: This is a rough estimate. 1586 if (IsFast) 1587 *IsFast = true; 1588 1589 return VT.bitsGT(MVT::i32) && Alignment >= Align(4); 1590 } 1591 1592 static SDValue CompactSwizzlableVector( 1593 SelectionDAG &DAG, SDValue VectorEntry, 1594 DenseMap<unsigned, unsigned> &RemapSwizzle) { 1595 assert(RemapSwizzle.empty()); 1596 1597 SDLoc DL(VectorEntry); 1598 EVT EltTy = VectorEntry.getValueType().getVectorElementType(); 1599 1600 SDValue NewBldVec[4]; 1601 for (unsigned i = 0; i < 4; i++) 1602 NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry, 1603 DAG.getIntPtrConstant(i, DL)); 1604 1605 for (unsigned i = 0; i < 4; i++) { 1606 if (NewBldVec[i].isUndef()) 1607 // We mask write here to teach later passes that the ith element of this 1608 // vector is undef. Thus we can use it to reduce 128 bits reg usage, 1609 // break false dependencies and additionally make assembly easier to read. 1610 RemapSwizzle[i] = 7; // SEL_MASK_WRITE 1611 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) { 1612 if (C->isZero()) { 1613 RemapSwizzle[i] = 4; // SEL_0 1614 NewBldVec[i] = DAG.getUNDEF(MVT::f32); 1615 } else if (C->isExactlyValue(1.0)) { 1616 RemapSwizzle[i] = 5; // SEL_1 1617 NewBldVec[i] = DAG.getUNDEF(MVT::f32); 1618 } 1619 } 1620 1621 if (NewBldVec[i].isUndef()) 1622 continue; 1623 1624 for (unsigned j = 0; j < i; j++) { 1625 if (NewBldVec[i] == NewBldVec[j]) { 1626 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); 1627 RemapSwizzle[i] = j; 1628 break; 1629 } 1630 } 1631 } 1632 1633 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), 1634 NewBldVec); 1635 } 1636 1637 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, 1638 DenseMap<unsigned, unsigned> &RemapSwizzle) { 1639 assert(RemapSwizzle.empty()); 1640 1641 SDLoc DL(VectorEntry); 1642 EVT EltTy = VectorEntry.getValueType().getVectorElementType(); 1643 1644 SDValue NewBldVec[4]; 1645 bool isUnmovable[4] = {false, false, false, false}; 1646 for (unsigned i = 0; i < 4; i++) 1647 NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry, 1648 DAG.getIntPtrConstant(i, DL)); 1649 1650 for (unsigned i = 0; i < 4; i++) { 1651 RemapSwizzle[i] = i; 1652 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 1653 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) 1654 ->getZExtValue(); 1655 if (i == Idx) 1656 isUnmovable[Idx] = true; 1657 } 1658 } 1659 1660 for (unsigned i = 0; i < 4; i++) { 1661 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 1662 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) 1663 ->getZExtValue(); 1664 if (isUnmovable[Idx]) 1665 continue; 1666 // Swap i and Idx 1667 std::swap(NewBldVec[Idx], NewBldVec[i]); 1668 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); 1669 break; 1670 } 1671 } 1672 1673 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), 1674 NewBldVec); 1675 } 1676 1677 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], 
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  return BuildVector;
}

SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
                                            SelectionDAG &DAG) const {
  SDLoc DL(LoadNode);
  EVT VT = LoadNode->getValueType(0);
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();
  assert(isa<ConstantSDNode>(Ptr));

  // TODO: Support smaller loads
  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 ||
      !ISD::isNON_EXTLoad(LoadNode))
    return SDValue();

  if (LoadNode->getAlignment() < 4)
    return SDValue();

  int ConstantBlock = ConstantAddressBlock(Block);

  SDValue Slots[4];
  for (unsigned i = 0; i < 4; i++) {
    // We want the Const position encoded with the following formula:
    // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    // const_index is Ptr computed by llvm using an alignment of 16.
    // Thus we add (((512 + (kc_bank << 12)) + chan) * 4 here and
    // then div by 4 at the ISel step.
    SDValue NewPtr = DAG.getNode(
        ISD::ADD, DL, Ptr.getValueType(), Ptr,
        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
  }
  EVT NewVT = MVT::v4i32;
  unsigned NumElements = 4;
  if (VT.isVector()) {
    NewVT = VT;
    NumElements = VT.getVectorNumElements();
  }
  SDValue Result = DAG.getBuildVector(NewVT, DL,
                                      makeArrayRef(Slots, NumElements));
  if (!VT.isVector()) {
    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                         DAG.getConstant(0, DL, MVT::i32));
  }
  SDValue MergedValues[2] = {
    Result,
    Chain
  };
  return DAG.getMergeValues(MergedValues, DL);
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
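  //
  // For example (added illustration of the fold performed below):
  //   (i32 fp_to_sint (fneg (select_cc f32 %a, f32 %b, 1.0, 0.0, setogt)))
  //     --> (i32 select_cc %a, %b, -1, 0, setogt)
  // which can then be selected to the corresponding SET*_DX10 instruction.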
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, DL, MVT::i32), // True
                       DAG.getConstant(0, DL, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
        (Arg.getOperand(0).getValueType().getVectorNumElements() ==
         Arg.getValueType().getVectorNumElements())) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC, LHS.getOperand(0).getValueType());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(DL,
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::R600_EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }

  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
    SDValue Ptr = LoadNode->getBasePtr();
    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
        isa<ConstantSDNode>(Ptr))
      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
    break;
  }

  default: break;
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
                                     SDValue &Sel, SDValue &Imm,
                                     SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Src.isMachineOpcode())
    return false;

  switch (Src.getMachineOpcode()) {
  case R600::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case R600::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case R600::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2),
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == R600::ALU_CONST) {
          ConstantSDNode *Cst =
              cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
    return true;
  }
  case R600::MOV_IMM_GLOBAL_ADDR:
    // Check if the Imm slot is used. Taken from below.
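    // A non-zero literal operand means another source already occupies the
    // immediate slot, so we cannot fold this one.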
    if (cast<ConstantSDNode>(Imm)->getZExtValue())
      return false;
    Imm = Src.getOperand(0);
    Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
    return true;
  case R600::MOV_IMM_I32:
  case R600::MOV_IMM_F32: {
    unsigned ImmReg = R600::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = R600::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = R600::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = R600::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = R600::ZERO;
      } else if (Value == 1) {
        ImmReg = R600::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == R600::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == R600::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
    };
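    // Walk the eight DOT_4 sources and try to fold modifiers or constants
    // into each one; rebuild the machine node as soon as a fold succeeds.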
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == R600::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}