10b57cec5SDimitry Andric //===-- NVPTXPeephole.cpp - NVPTX Peephole Optimiztions -------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // In NVPTX, NVPTXFrameLowering will emit following instruction at the beginning 100b57cec5SDimitry Andric // of a MachineFunction. 110b57cec5SDimitry Andric // 120b57cec5SDimitry Andric // mov %SPL, %depot 130b57cec5SDimitry Andric // cvta.local %SP, %SPL 140b57cec5SDimitry Andric // 150b57cec5SDimitry Andric // Because Frame Index is a generic address and alloca can only return generic 160b57cec5SDimitry Andric // pointer, without this pass the instructions producing alloca'ed address will 170b57cec5SDimitry Andric // be based on %SP. NVPTXLowerAlloca tends to help replace store and load on 180b57cec5SDimitry Andric // this address with their .local versions, but this may introduce a lot of 190b57cec5SDimitry Andric // cvta.to.local instructions. Performance can be improved if we avoid casting 200b57cec5SDimitry Andric // address back and forth and directly calculate local address based on %SPL. 210b57cec5SDimitry Andric // This peephole pass optimizes these cases, for example 220b57cec5SDimitry Andric // 230b57cec5SDimitry Andric // It will transform the following pattern 24349cc55cSDimitry Andric // %0 = LEA_ADDRi64 %VRFrame64, 4 25*0fca6ea1SDimitry Andric // %1 = cvta_to_local_64 %0 260b57cec5SDimitry Andric // 270b57cec5SDimitry Andric // into 28349cc55cSDimitry Andric // %1 = LEA_ADDRi64 %VRFrameLocal64, 4 290b57cec5SDimitry Andric // 30349cc55cSDimitry Andric // %VRFrameLocal64 is the virtual register name of %SPL 310b57cec5SDimitry Andric // 320b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 330b57cec5SDimitry Andric 340b57cec5SDimitry Andric #include "NVPTX.h" 35349cc55cSDimitry Andric #include "NVPTXRegisterInfo.h" 36349cc55cSDimitry Andric #include "NVPTXSubtarget.h" 370b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 380b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h" 390b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h" 400b57cec5SDimitry Andric #include "llvm/CodeGen/TargetInstrInfo.h" 410b57cec5SDimitry Andric #include "llvm/CodeGen/TargetRegisterInfo.h" 420b57cec5SDimitry Andric 430b57cec5SDimitry Andric using namespace llvm; 440b57cec5SDimitry Andric 450b57cec5SDimitry Andric #define DEBUG_TYPE "nvptx-peephole" 460b57cec5SDimitry Andric 470b57cec5SDimitry Andric namespace llvm { 480b57cec5SDimitry Andric void initializeNVPTXPeepholePass(PassRegistry &); 490b57cec5SDimitry Andric } 500b57cec5SDimitry Andric 510b57cec5SDimitry Andric namespace { 520b57cec5SDimitry Andric struct NVPTXPeephole : public MachineFunctionPass { 530b57cec5SDimitry Andric public: 540b57cec5SDimitry Andric static char ID; 550b57cec5SDimitry Andric NVPTXPeephole() : MachineFunctionPass(ID) { 560b57cec5SDimitry Andric initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry()); 570b57cec5SDimitry Andric } 580b57cec5SDimitry Andric 590b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 600b57cec5SDimitry Andric 610b57cec5SDimitry Andric StringRef getPassName() const override { 620b57cec5SDimitry Andric return "NVPTX optimize redundant cvta.to.local instruction"; 630b57cec5SDimitry Andric } 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override { 660b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 670b57cec5SDimitry Andric } 680b57cec5SDimitry Andric }; 690b57cec5SDimitry Andric } 700b57cec5SDimitry Andric 710b57cec5SDimitry Andric char NVPTXPeephole::ID = 0; 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false) 740b57cec5SDimitry Andric 750b57cec5SDimitry Andric static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) { 760b57cec5SDimitry Andric auto &MBB = *Root.getParent(); 770b57cec5SDimitry Andric auto &MF = *MBB.getParent(); 780b57cec5SDimitry Andric // Check current instruction is cvta.to.local 79*0fca6ea1SDimitry Andric if (Root.getOpcode() != NVPTX::cvta_to_local_64 && 80*0fca6ea1SDimitry Andric Root.getOpcode() != NVPTX::cvta_to_local) 810b57cec5SDimitry Andric return false; 820b57cec5SDimitry Andric 830b57cec5SDimitry Andric auto &Op = Root.getOperand(1); 840b57cec5SDimitry Andric const auto &MRI = MF.getRegInfo(); 850b57cec5SDimitry Andric MachineInstr *GenericAddrDef = nullptr; 86bdd1243dSDimitry Andric if (Op.isReg() && Op.getReg().isVirtual()) { 870b57cec5SDimitry Andric GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg()); 880b57cec5SDimitry Andric } 890b57cec5SDimitry Andric 900b57cec5SDimitry Andric // Check the register operand is uniquely defined by LEA_ADDRi instruction 910b57cec5SDimitry Andric if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB || 920b57cec5SDimitry Andric (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 && 930b57cec5SDimitry Andric GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) { 940b57cec5SDimitry Andric return false; 950b57cec5SDimitry Andric } 960b57cec5SDimitry Andric 97349cc55cSDimitry Andric const NVPTXRegisterInfo *NRI = 98349cc55cSDimitry Andric MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo(); 99349cc55cSDimitry Andric 1000b57cec5SDimitry Andric // Check the LEA_ADDRi operand is Frame index 1010b57cec5SDimitry Andric auto &BaseAddrOp = GenericAddrDef->getOperand(1); 102349cc55cSDimitry Andric if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NRI->getFrameRegister(MF)) { 1030b57cec5SDimitry Andric return true; 1040b57cec5SDimitry Andric } 1050b57cec5SDimitry Andric 1060b57cec5SDimitry Andric return false; 1070b57cec5SDimitry Andric } 1080b57cec5SDimitry Andric 1090b57cec5SDimitry Andric static void CombineCVTAToLocal(MachineInstr &Root) { 1100b57cec5SDimitry Andric auto &MBB = *Root.getParent(); 1110b57cec5SDimitry Andric auto &MF = *MBB.getParent(); 1120b57cec5SDimitry Andric const auto &MRI = MF.getRegInfo(); 1130b57cec5SDimitry Andric const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 1140b57cec5SDimitry Andric auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); 1150b57cec5SDimitry Andric 116349cc55cSDimitry Andric const NVPTXRegisterInfo *NRI = 117349cc55cSDimitry Andric MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo(); 118349cc55cSDimitry Andric 1190b57cec5SDimitry Andric MachineInstrBuilder MIB = 1200b57cec5SDimitry Andric BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()), 1210b57cec5SDimitry Andric Root.getOperand(0).getReg()) 122349cc55cSDimitry Andric .addReg(NRI->getFrameLocalRegister(MF)) 1230b57cec5SDimitry Andric .add(Prev.getOperand(2)); 1240b57cec5SDimitry Andric 1250b57cec5SDimitry Andric MBB.insert((MachineBasicBlock::iterator)&Root, MIB); 1260b57cec5SDimitry Andric 1270b57cec5SDimitry Andric // Check if MRI has only one non dbg use, which is Root 1280b57cec5SDimitry Andric if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) { 1290eae32dcSDimitry Andric Prev.eraseFromParent(); 1300b57cec5SDimitry Andric } 1310eae32dcSDimitry Andric Root.eraseFromParent(); 1320b57cec5SDimitry Andric } 1330b57cec5SDimitry Andric 1340b57cec5SDimitry Andric bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { 1350b57cec5SDimitry Andric if (skipFunction(MF.getFunction())) 1360b57cec5SDimitry Andric return false; 1370b57cec5SDimitry Andric 1380b57cec5SDimitry Andric bool Changed = false; 1390b57cec5SDimitry Andric // Loop over all of the basic blocks. 1400b57cec5SDimitry Andric for (auto &MBB : MF) { 1410b57cec5SDimitry Andric // Traverse the basic block. 1420b57cec5SDimitry Andric auto BlockIter = MBB.begin(); 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric while (BlockIter != MBB.end()) { 1450b57cec5SDimitry Andric auto &MI = *BlockIter++; 1460b57cec5SDimitry Andric if (isCVTAToLocalCombinationCandidate(MI)) { 1470b57cec5SDimitry Andric CombineCVTAToLocal(MI); 1480b57cec5SDimitry Andric Changed = true; 1490b57cec5SDimitry Andric } 1500b57cec5SDimitry Andric } // Instruction 1510b57cec5SDimitry Andric } // Basic Block 1520b57cec5SDimitry Andric 153349cc55cSDimitry Andric const NVPTXRegisterInfo *NRI = 154349cc55cSDimitry Andric MF.getSubtarget<NVPTXSubtarget>().getRegisterInfo(); 155349cc55cSDimitry Andric 1560b57cec5SDimitry Andric // Remove unnecessary %VRFrame = cvta.local %VRFrameLocal 1570b57cec5SDimitry Andric const auto &MRI = MF.getRegInfo(); 158349cc55cSDimitry Andric if (MRI.use_empty(NRI->getFrameRegister(MF))) { 159349cc55cSDimitry Andric if (auto MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF))) { 1600eae32dcSDimitry Andric MI->eraseFromParent(); 1610b57cec5SDimitry Andric } 1620b57cec5SDimitry Andric } 1630b57cec5SDimitry Andric 1640b57cec5SDimitry Andric return Changed; 1650b57cec5SDimitry Andric } 1660b57cec5SDimitry Andric 1670b57cec5SDimitry Andric MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); } 168