1 //===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass turns chains of integer comparisons into memcmp (the memcmp is 10 // later typically inlined as a chain of efficient hardware comparisons). This 11 // typically benefits c++ member or nonmember operator==(). 12 // 13 // The basic idea is to replace a longer chain of integer comparisons loaded 14 // from contiguous memory locations into a shorter chain of larger integer 15 // comparisons. Benefits are double: 16 // - There are less jumps, and therefore less opportunities for mispredictions 17 // and I-cache misses. 18 // - Code size is smaller, both because jumps are removed and because the 19 // encoding of a 2*n byte compare is smaller than that of two n-byte 20 // compares. 21 // 22 // Example: 23 // 24 // struct S { 25 // int a; 26 // char b; 27 // char c; 28 // uint16_t d; 29 // bool operator==(const S& o) const { 30 // return a == o.a && b == o.b && c == o.c && d == o.d; 31 // } 32 // }; 33 // 34 // Is optimized as : 35 // 36 // bool S::operator==(const S& o) const { 37 // return memcmp(this, &o, 8) == 0; 38 // } 39 // 40 // Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp. 
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

using namespace llvm;

namespace {

#define DEBUG_TYPE "mergeicmps"

// Returns true if the instruction is a simple (non-volatile, non-atomic) load
// or a simple store. Any other instruction returns false.
static bool isSimpleLoadOrStore(const Instruction *I) {
  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  return false;
}

// A BCE atom "Binary Compare Expression Atom" represents an integer load
// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
// at the top.
struct BCEAtom {
  BCEAtom() = default;
  BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
      : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}

  // We want to order BCEAtoms by (Base, Offset). However we cannot use
  // the pointer values for Base because these are non-deterministic.
  // To make sure that the sort order is stable, we first assign to each atom
  // base value an index based on its order of appearance in the chain of
  // comparisons. We call this index `BaseOrdering`. For example, for:
  //    b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
  //    | block 1 |    | block 2 |    | block 3 |
  // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
  // which is before block 2.
  // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
  bool operator<(const BCEAtom &O) const {
    // Compare base ids first; only compare offsets (signed) within the same
    // base, so that atoms of the same base sort into increasing address order.
    return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
  }

  GetElementPtrInst *GEP = nullptr;
  LoadInst *LoadI = nullptr;
  // 0 means "invalid atom": BaseIdentifier starts numbering at 1, so a
  // default-constructed BCEAtom is distinguishable from a real one.
  unsigned BaseId = 0;
  APInt Offset;
};

// A class that assigns increasing ids to values in the order in which they are
// seen. See comment in `BCEAtom::operator<()`.
class BaseIdentifier {
public:
  // Returns the id for value `Base`, after assigning one if `Base` has not been
  // seen before.
  // NOTE(review): ids are stored as int but Order is unsigned and BCEAtom
  // keeps them as unsigned — the signedness mix looks benign for realistic
  // chain lengths, but worth unifying.
  int getBaseId(const Value *Base) {
    assert(Base && "invalid base");
    // try_emplace leaves the existing entry untouched if Base was seen before.
    const auto Insertion = BaseToIndex.try_emplace(Base, Order);
    if (Insertion.second)
      ++Order;
    return Insertion.first->second;
  }

private:
  // Starts at 1 so that id 0 can mean "invalid" in BCEAtom.
  unsigned Order = 1;
  DenseMap<const Value*, int> BaseToIndex;
};

// If this value is a load from a constant offset w.r.t. a base address, and
// there are no other users of the load or address, returns the base address and
// the offset. Returns a default (invalid, BaseId == 0) BCEAtom otherwise.
BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
  auto *const LoadI = dyn_cast<LoadInst>(Val);
  if (!LoadI)
    return {};
  LLVM_DEBUG(dbgs() << "load\n");
  // The load must be consumed entirely inside its block: merging would
  // otherwise delete a value that still has users elsewhere.
  if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
    LLVM_DEBUG(dbgs() << "used outside of block\n");
    return {};
  }
  // Do not optimize atomic loads to non-atomic memcmp
  if (!LoadI->isSimple()) {
    LLVM_DEBUG(dbgs() << "volatile or atomic\n");
    return {};
  }
  Value *const Addr = LoadI->getOperand(0);
  auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
  if (!GEP)
    return {};
  LLVM_DEBUG(dbgs() << "GEP\n");
  // Same single-block-use requirement for the address computation.
  if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
    LLVM_DEBUG(dbgs() << "used outside of block\n");
    return {};
  }
  const auto &DL = GEP->getModule()->getDataLayout();
  if (!isDereferenceablePointer(GEP, DL)) {
    LLVM_DEBUG(dbgs() << "not dereferenceable\n");
    // We need to make sure that we can do comparison in any order, so we
    // require memory to be unconditionally dereferenceable. (Sorting the
    // chain may hoist this load above the guards that originally preceded it.)
    return {};
  }
  // Only accept addresses that are a compile-time-constant byte offset from
  // the GEP's pointer operand.
  APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
  if (!GEP->accumulateConstantOffset(DL, Offset))
    return {};
  return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
                 Offset);
}

// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
// example at the top.
// The block might do extra work besides the atom comparison, in which case
// doesOtherWork() returns true. Under some conditions, the block can be
// split into the atom comparison part and the "other work" part
// (see canSplit()).
// Note: the terminology is misleading: the comparison is symmetric, so there
// is no real {l/r}hs. What we want though is to have the same base on the
// left (resp. right), so that we can detect consecutive loads. To ensure this
// we put the smallest atom on the left.
class BCECmpBlock {
 public:
  BCECmpBlock() {}

  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
    // Canonicalize: the smaller atom (by BaseId/Offset) always goes on the
    // left so that consecutive blocks comparing the same bases line up.
    if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
  }

  // A block is valid when both atoms were successfully recognized
  // (BaseId == 0 marks a default-constructed, invalid atom).
  bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }

  // Assert the block is consistent: If valid, it should also have
  // non-null members besides Lhs_ and Rhs_.
  void AssertConsistent() const {
    if (IsValid()) {
      assert(BB);
      assert(CmpI);
      assert(BranchI);
    }
  }

  const BCEAtom &Lhs() const { return Lhs_; }
  const BCEAtom &Rhs() const { return Rhs_; }
  int SizeBits() const { return SizeBits_; }

  // Returns true if the block does other works besides comparison.
  bool doesOtherWork() const;

  // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
  // instructions in the block.
  bool canSplit(AliasAnalysis *AA) const;

  // Return true if this all the relevant instructions in the BCE-cmp-block can
  // be sunk below this instruction. By doing this, we know we can separate the
  // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
  // block.
  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
                         AliasAnalysis *AA) const;

  // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
  // instructions. Split the old block and move all non-BCE-cmp-insts into the
  // new parent block.
  void split(BasicBlock *NewParent, AliasAnalysis *AA) const;

  // The basic block where this comparison happens.
  BasicBlock *BB = nullptr;
  // The ICMP for this comparison.
  ICmpInst *CmpI = nullptr;
  // The terminating branch.
  BranchInst *BranchI = nullptr;
  // The block requires splitting.
  bool RequireSplit = false;

private:
  BCEAtom Lhs_;
  BCEAtom Rhs_;
  int SizeBits_ = 0;
};

bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
                                    DenseSet<Instruction *> &BlockInsts,
                                    AliasAnalysis *AA) const {
  // If this instruction has side effects and its in middle of the BCE cmp block
  // instructions, then bail for now.
  if (Inst->mayHaveSideEffects()) {
    // Bail if this is not a simple load or store
    if (!isSimpleLoadOrStore(Inst))
      return false;
    // Disallow stores that might alias the BCE operands: hoisting such a
    // store above the comparison would change the values being compared.
    MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
    MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
    if (isModSet(AA->getModRefInfo(Inst, LLoc)) ||
        isModSet(AA->getModRefInfo(Inst, RLoc)))
      return false;
  }
  // Make sure this instruction does not use any of the BCE cmp block
  // instructions as operand.
  for (auto BI : BlockInsts) {
    if (is_contained(Inst->operands(), BI))
      return false;
  }
  return true;
}

void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis *AA) const {
  // The six instructions that form the BCE comparison itself; everything
  // else in the block is "other work" to be moved out.
  DenseSet<Instruction *> BlockInsts(
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
  llvm::SmallVector<Instruction *, 4> OtherInsts;
  for (Instruction &Inst : *BB) {
    if (BlockInsts.count(&Inst))
      continue;
    assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
           "Split unsplittable block");
    // This is a non-BCE-cmp-block instruction. And it can be separated
    // from the BCE-cmp-block instruction.
    OtherInsts.push_back(&Inst);
  }

  // Do the actual splitting. Moving in reverse while always inserting at
  // NewParent->begin() preserves the original relative order.
  for (Instruction *Inst : reverse(OtherInsts)) {
    Inst->moveBefore(&*NewParent->begin());
  }
}

bool BCECmpBlock::canSplit(AliasAnalysis *AA) const {
  DenseSet<Instruction *> BlockInsts(
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
  // The block is splittable only if every non-comparison instruction can be
  // moved out without changing semantics.
  for (Instruction &Inst : *BB) {
    if (!BlockInsts.count(&Inst)) {
      if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
        return false;
    }
  }
  return true;
}

bool BCECmpBlock::doesOtherWork() const {
  AssertConsistent();
  // All the instructions we care about in the BCE cmp block.
  DenseSet<Instruction *> BlockInsts(
      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
  // TODO(courbet): Can we allow some other things ? This is very conservative.
  // We might be able to get away with anything does not have any side
  // effects outside of the basic block.
  // Note: The GEPs and/or loads are not necessarily in the same block.
  for (const Instruction &Inst : *BB) {
    if (!BlockInsts.count(&Inst))
      return true;
  }
  return false;
}

// Visit the given comparison. If this is a comparison between two valid
// BCE atoms, returns the comparison.
303 BCECmpBlock visitICmp(const ICmpInst *const CmpI, 304 const ICmpInst::Predicate ExpectedPredicate, 305 BaseIdentifier &BaseId) { 306 // The comparison can only be used once: 307 // - For intermediate blocks, as a branch condition. 308 // - For the final block, as an incoming value for the Phi. 309 // If there are any other uses of the comparison, we cannot merge it with 310 // other comparisons as we would create an orphan use of the value. 311 if (!CmpI->hasOneUse()) { 312 LLVM_DEBUG(dbgs() << "cmp has several uses\n"); 313 return {}; 314 } 315 if (CmpI->getPredicate() != ExpectedPredicate) 316 return {}; 317 LLVM_DEBUG(dbgs() << "cmp " 318 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne") 319 << "\n"); 320 auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId); 321 if (!Lhs.BaseId) 322 return {}; 323 auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId); 324 if (!Rhs.BaseId) 325 return {}; 326 const auto &DL = CmpI->getModule()->getDataLayout(); 327 return BCECmpBlock(std::move(Lhs), std::move(Rhs), 328 DL.getTypeSizeInBits(CmpI->getOperand(0)->getType())); 329 } 330 331 // Visit the given comparison block. If this is a comparison between two valid 332 // BCE atoms, returns the comparison. 333 BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, 334 const BasicBlock *const PhiBlock, 335 BaseIdentifier &BaseId) { 336 if (Block->empty()) return {}; 337 auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator()); 338 if (!BranchI) return {}; 339 LLVM_DEBUG(dbgs() << "branch\n"); 340 if (BranchI->isUnconditional()) { 341 // In this case, we expect an incoming value which is the result of the 342 // comparison. This is the last link in the chain of comparisons (note 343 // that this does not mean that this is the last incoming value, blocks 344 // can be reordered). 
345 auto *const CmpI = dyn_cast<ICmpInst>(Val); 346 if (!CmpI) return {}; 347 LLVM_DEBUG(dbgs() << "icmp\n"); 348 auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId); 349 Result.CmpI = CmpI; 350 Result.BranchI = BranchI; 351 return Result; 352 } else { 353 // In this case, we expect a constant incoming value (the comparison is 354 // chained). 355 const auto *const Const = dyn_cast<ConstantInt>(Val); 356 LLVM_DEBUG(dbgs() << "const\n"); 357 if (!Const->isZero()) return {}; 358 LLVM_DEBUG(dbgs() << "false\n"); 359 auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition()); 360 if (!CmpI) return {}; 361 LLVM_DEBUG(dbgs() << "icmp\n"); 362 assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch"); 363 BasicBlock *const FalseBlock = BranchI->getSuccessor(1); 364 auto Result = visitICmp( 365 CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, 366 BaseId); 367 Result.CmpI = CmpI; 368 Result.BranchI = BranchI; 369 return Result; 370 } 371 return {}; 372 } 373 374 static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons, 375 BCECmpBlock &Comparison) { 376 LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName() 377 << "': Found cmp of " << Comparison.SizeBits() 378 << " bits between " << Comparison.Lhs().BaseId << " + " 379 << Comparison.Lhs().Offset << " and " 380 << Comparison.Rhs().BaseId << " + " 381 << Comparison.Rhs().Offset << "\n"); 382 LLVM_DEBUG(dbgs() << "\n"); 383 Comparisons.push_back(Comparison); 384 } 385 386 // A chain of comparisons. 
class BCECmpChain {
 public:
  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
              AliasAnalysis *AA);

  int size() const { return Comparisons_.size(); }

#ifdef MERGEICMPS_DOT_ON
  void dump() const;
#endif  // MERGEICMPS_DOT_ON

  // Merges contiguous comparisons into memcmp calls. Returns true if any
  // merge was performed (in which case analyses are invalidated).
  bool simplify(const TargetLibraryInfo *const TLI, AliasAnalysis *AA);

 private:
  // Two comparison blocks are contiguous when they compare the same pair of
  // bases and the second block's offsets start exactly where the first
  // block's loads end (offset + size-in-bytes).
  static bool IsContiguous(const BCECmpBlock &First,
                           const BCECmpBlock &Second) {
    return First.Lhs().BaseId == Second.Lhs().BaseId &&
           First.Rhs().BaseId == Second.Rhs().BaseId &&
           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
  }

  // Merges the given comparison blocks into one memcmp block and update
  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
  // null, the merged block will link to the phi block.
  void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                        BasicBlock *const NextBBInChain, PHINode &Phi,
                        const TargetLibraryInfo *const TLI, AliasAnalysis *AA);

  PHINode &Phi_;
  // Empty when the chain could not be fully recognized (see the ctor).
  std::vector<BCECmpBlock> Comparisons_;
  // The original entry block (before sorting);
  BasicBlock *EntryBlock_;
};

// Builds the chain by visiting every block. On any unrecognized or
// unmergeable block the whole chain is abandoned (Comparisons_ stays empty,
// so size() < 2 and the caller skips it).
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
                         AliasAnalysis *AA)
    : Phi_(Phi) {
  assert(!Blocks.empty() && "a chain should have at least one block");
  // Now look inside blocks to check for BCE comparisons.
  std::vector<BCECmpBlock> Comparisons;
  BaseIdentifier BaseId;
  for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
    BasicBlock *const Block = Blocks[BlockIdx];
    assert(Block && "invalid block");
    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
                                           Block, Phi.getParent(), BaseId);
    Comparison.BB = Block;
    if (!Comparison.IsValid()) {
      LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
      return;
    }
    if (Comparison.doesOtherWork()) {
      LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
                        << "' does extra work besides compare\n");
      if (Comparisons.empty()) {
        // This is the initial block in the chain, in case this block does other
        // work, we can try to split the block and move the irrelevant
        // instructions to the predecessor.
        //
        // If this is not the initial block in the chain, splitting it won't
        // work.
        //
        // As once split, there will still be instructions before the BCE cmp
        // instructions that do other work in program order, i.e. within the
        // chain before sorting. Unless we can abort the chain at this point
        // and start anew.
        //
        // NOTE: we only handle block with single predecessor for now.
        if (Comparison.canSplit(AA)) {
          LLVM_DEBUG(dbgs()
                     << "Split initial block '" << Comparison.BB->getName()
                     << "' that does extra work besides compare\n");
          // The actual split is deferred to mergeComparisons(), once we know
          // the chain will indeed be collapsed.
          Comparison.RequireSplit = true;
          enqueueBlock(Comparisons, Comparison);
        } else {
          LLVM_DEBUG(dbgs()
                     << "ignoring initial block '" << Comparison.BB->getName()
                     << "' that does extra work besides compare\n");
        }
        continue;
      }
      // TODO(courbet): Right now we abort the whole chain. We could be
      // merging only the blocks that don't do other work and resume the
      // chain from there. For example:
      //  if (a[0] == b[0]) {  // bb1
      //    if (a[1] == b[1]) { // bb2
      //      some_value = 3; //bb3
      //      if (a[2] == b[2]) { //bb3
      //        do a ton of stuff  //bb4
      //      }
      //    }
      //  }
      //
      // This is:
      //
      // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
      //  \            \           \               \
      //   ne           ne          ne              \
      //    \            \           \               v
      //     +------------+-----------+----------> bb_phi
      //
      // We can only merge the first two comparisons, because bb3* does
      // "other work" (setting some_value to 3).
      // We could still merge bb1 and bb2 though.
      return;
    }
    enqueueBlock(Comparisons, Comparison);
  }

  // It is possible we have no suitable comparison to merge.
  if (Comparisons.empty()) {
    LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
    return;
  }
  EntryBlock_ = Comparisons[0].BB;
  Comparisons_ = std::move(Comparisons);
#ifdef MERGEICMPS_DOT_ON
  errs() << "BEFORE REORDERING:\n\n";
  dump();
#endif  // MERGEICMPS_DOT_ON
  // Reorder blocks by LHS. We can do that without changing the
  // semantics because we are only accessing dereferenceable memory.
  llvm::sort(Comparisons_,
             [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
               return LhsBlock.Lhs() < RhsBlock.Lhs();
             });
#ifdef MERGEICMPS_DOT_ON
  errs() << "AFTER REORDERING:\n\n";
  dump();
#endif  // MERGEICMPS_DOT_ON
}

#ifdef MERGEICMPS_DOT_ON
// Debug-only graphviz dump of the chain.
// NOTE(review): this calls `Comparison.Lhs().Base()`, but BCEAtom has no
// Base() accessor (only BaseId) — this #ifdef'd-out code looks stale and
// would not compile if MERGEICMPS_DOT_ON were defined; verify before
// enabling.
void BCECmpChain::dump() const {
  errs() << "digraph dag {\n";
  errs() << " graph [bgcolor=transparent];\n";
  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
  errs() << " edge [color=black];\n";
  for (size_t I = 0; I < Comparisons_.size(); ++I) {
    const auto &Comparison = Comparisons_[I];
    errs() << " \"" << I << "\" [label=\"%"
           << Comparison.Lhs().Base()->getName() << " + "
           << Comparison.Lhs().Offset << " == %"
           << Comparison.Rhs().Base()->getName() << " + "
           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
           << " bytes)\"];\n";
    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
    if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
  }
  errs() << " \"Phi\" [label=\"Phi\"];\n";
  errs() << "}\n\n";
}
#endif  // MERGEICMPS_DOT_ON

bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI,
                           AliasAnalysis *AA) {
  // First pass to check if there is at least one merge. If not, we don't do
  // anything and we keep analysis passes intact.
  {
    bool AtLeastOneMerged = false;
    for (size_t I = 1; I < Comparisons_.size(); ++I) {
      if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
        AtLeastOneMerged = true;
        break;
      }
    }
    if (!AtLeastOneMerged) return false;
  }

  // Remove phi references to comparison blocks, they will be rebuilt as we
  // merge the blocks.
  for (const auto &Comparison : Comparisons_) {
    Phi_.removeIncomingValue(Comparison.BB, false);
  }

  // If entry block is part of the chain, we need to make the first block
  // of the chain the new entry block of the function.
  BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock();
  for (size_t I = 1; I < Comparisons_.size(); ++I) {
    if (Entry == Comparisons_[I].BB) {
      // A function entry block cannot have predecessors, so insert a fresh
      // trampoline block before it that branches to the old entry.
      BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "",
                                                Entry->getParent(), Entry);
      BranchInst::Create(Entry, NEntryBB);
      break;
    }
  }

  // Point the predecessors of the chain to the first comparison block (which is
  // the new entry point) and update the entry block of the chain.
  if (EntryBlock_ != Comparisons_[0].BB) {
    EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
    EntryBlock_ = Comparisons_[0].BB;
  }

  // Effectively merge blocks. We go in the (sorted) chain order, growing a
  // run of contiguous comparisons and flushing it whenever contiguity breaks.
  int NumMerged = 1;
  for (size_t I = 1; I < Comparisons_.size(); ++I) {
    if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
      ++NumMerged;
    } else {
      // Merge all previous comparisons and start a new merge block.
      mergeComparisons(
          makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
          Comparisons_[I].BB, Phi_, TLI, AA);
      NumMerged = 1;
    }
  }
  // Flush the final run; nullptr means "branch to the phi block".
  mergeComparisons(makeArrayRef(Comparisons_)
                       .slice(Comparisons_.size() - NumMerged, NumMerged),
                   nullptr, Phi_, TLI, AA);

  return true;
}

void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                                   BasicBlock *const NextBBInChain,
                                   PHINode &Phi,
                                   const TargetLibraryInfo *const TLI,
                                   AliasAnalysis *AA) {
  assert(!Comparisons.empty());
  const auto &FirstComparison = *Comparisons.begin();
  BasicBlock *const BB = FirstComparison.BB;
  LLVMContext &Context = BB->getContext();

  if (Comparisons.size() >= 2) {
    // If there is one block that requires splitting, we do it now, i.e.
    // just before we know we will collapse the chain. The instructions
    // can be executed before any of the instructions in the chain.
    auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
                          [](const BCECmpBlock &B) { return B.RequireSplit; });
    if (C != Comparisons.end())
      C->split(EntryBlock_, AA);

    LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
    // Total number of bytes compared, i.e. the size argument for memcmp.
    const auto TotalSize =
        std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
                        [](int Size, const BCECmpBlock &C) {
                          return Size + C.SizeBits();
                        }) /
        8;

    // Incoming edges do not need to be updated, and both GEPs are already
    // computing the right address, we just need to:
    //   - replace the two loads and the icmp with the memcmp
    //   - update the branch
    //   - update the incoming values in the phi.
    FirstComparison.BranchI->eraseFromParent();
    FirstComparison.CmpI->eraseFromParent();
    FirstComparison.Lhs().LoadI->eraseFromParent();
    FirstComparison.Rhs().LoadI->eraseFromParent();

    IRBuilder<> Builder(BB);
    const auto &DL = Phi.getModule()->getDataLayout();
    Value *const MemCmpCall = emitMemCmp(
        FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
        ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
        Builder, DL, TLI);
    // memcmp returns 0 on equality; the chain's semantics are "all equal".
    Value *const MemCmpIsZero = Builder.CreateICmpEQ(
        MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));

    // Add a branch to the next basic block in the chain.
    if (NextBBInChain) {
      Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
      Phi.addIncoming(ConstantInt::getFalse(Context), BB);
    } else {
      Builder.CreateBr(Phi.getParent());
      Phi.addIncoming(MemCmpIsZero, BB);
    }

    // Delete merged blocks.
    for (size_t I = 1; I < Comparisons.size(); ++I) {
      BasicBlock *CBB = Comparisons[I].BB;
      CBB->replaceAllUsesWith(BB);
      CBB->eraseFromParent();
    }
  } else {
    assert(Comparisons.size() == 1);
    // There are no blocks to merge, but we still need to update the branches.
    LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
    if (NextBBInChain) {
      if (FirstComparison.BranchI->isConditional()) {
        LLVM_DEBUG(dbgs() << "conditional -> conditional\n");
        // Just update the "true" target, the "false" target should already be
        // the phi block.
        assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
        FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
        Phi.addIncoming(ConstantInt::getFalse(Context), BB);
      } else {
        LLVM_DEBUG(dbgs() << "unconditional -> conditional\n");
        // Replace the unconditional branch by a conditional one.
        FirstComparison.BranchI->eraseFromParent();
        IRBuilder<> Builder(BB);
        Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
                             Phi.getParent());
        Phi.addIncoming(FirstComparison.CmpI, BB);
      }
    } else {
      if (FirstComparison.BranchI->isConditional()) {
        LLVM_DEBUG(dbgs() << "conditional -> unconditional\n");
        // Replace the conditional branch by an unconditional one.
        FirstComparison.BranchI->eraseFromParent();
        IRBuilder<> Builder(BB);
        Builder.CreateBr(Phi.getParent());
        Phi.addIncoming(FirstComparison.CmpI, BB);
      } else {
        LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n");
        Phi.addIncoming(FirstComparison.CmpI, BB);
      }
    }
  }
}

// Walks the single-predecessor chain backwards from `LastBlock` and returns
// the `NumBlocks` blocks in forward (entry-to-last) order, or an empty
// vector if the CFG does not have the expected shape.
std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
                                           BasicBlock *const LastBlock,
                                           int NumBlocks) {
  // Walk up from the last block to find other blocks.
  std::vector<BasicBlock *> Blocks(NumBlocks);
  assert(LastBlock && "invalid last block");
  BasicBlock *CurBlock = LastBlock;
  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
    if (CurBlock->hasAddressTaken()) {
      // Somebody is jumping to the block through an address, all bets are
      // off.
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
                        << " has its address taken\n");
      return {};
    }
    Blocks[BlockIndex] = CurBlock;
    auto *SinglePredecessor = CurBlock->getSinglePredecessor();
    if (!SinglePredecessor) {
      // The block has two or more predecessors.
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
                        << " has two or more predecessors\n");
      return {};
    }
    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
      // The block does not link back to the phi.
      LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
                        << " does not link back to the phi\n");
      return {};
    }
    CurBlock = SinglePredecessor;
  }
  Blocks[0] = CurBlock;
  return Blocks;
}

// Entry point of the chain recognition: given a phi that merges the results
// of a comparison chain, tries to rewrite the chain as memcmp calls.
// Returns true if the IR was changed.
bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI,
                AliasAnalysis *AA) {
  LLVM_DEBUG(dbgs() << "processPhi()\n");
  if (Phi.getNumIncomingValues() <= 1) {
    LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
    return false;
  }
  // We are looking for something that has the following structure:
  //   bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
  //    \            \           \               \
  //     ne           ne          ne              \
  //      \            \           \               v
  //       +------------+-----------+----------> bb_phi
  //
  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi.
  //    It's the only block that contributes a non-constant value to the Phi.
  //  - All other blocks (b1, b2, b3) must have exactly two successors, one of
  //    them being the phi block.
  //  - All intermediate blocks (bb2, bb3) must have only one predecessor.
  //  - Blocks cannot do other work besides the comparison, see doesOtherWork()

  // The blocks are not necessarily ordered in the phi, so we start from the
  // last block and reconstruct the order.
  BasicBlock *LastBlock = nullptr;
  for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
    if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
    if (LastBlock) {
      // There are several non-constant values.
      LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
      return false;
    }
    if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
        cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
            Phi.getIncomingBlock(I)) {
      // Non-constant incoming value is not from a cmp instruction or not
      // produced by the last block. We could end up processing the value
      // producing block more than once.
      //
      // This is an uncommon case, so we bail.
      LLVM_DEBUG(
          dbgs()
          << "skip: non-constant value not from cmp or not from last block.\n");
      return false;
    }
    LastBlock = Phi.getIncomingBlock(I);
  }
  if (!LastBlock) {
    // There is no non-constant block.
    LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
    return false;
  }
  if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
    LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
    return false;
  }

  const auto Blocks =
      getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
  if (Blocks.empty()) return false;
  BCECmpChain CmpChain(Blocks, Phi, AA);

  // A chain of fewer than two comparisons has nothing to merge.
  if (CmpChain.size() < 2) {
    LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
    return false;
  }

  return CmpChain.simplify(TLI, AA);
}

// Legacy pass manager wrapper for the MergeICmps transform.
class MergeICmps : public FunctionPass {
 public:
  static char ID;

  MergeICmps() : FunctionPass(ID) {
    initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F)) return false;
    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto PA = runImpl(F, &TLI, &TTI, AA);
    return !PA.areAllPreserved();
  }

 private:
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
  }

  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo *TTI, AliasAnalysis *AA);
};

PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
                                      const TargetTransformInfo *TTI,
                                      AliasAnalysis *AA) {
  LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");

  // We only try merging comparisons if the target wants to expand memcmp later.
  // The rationale is to avoid turning small chains into memcmp calls.
  if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all();

  // If we don't have memcmp available we can't emit calls to it.
  if (!TLI->has(LibFunc_memcmp))
    return PreservedAnalyses::all();

  bool MadeChange = false;

  // Skip the entry block (++F.begin()): a phi-headed comparison chain can
  // never end in the entry block since it has no predecessors.
  for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
    // A Phi operation is always first in a basic block.
    // NOTE(review): `&*BBIt->begin()` assumes the block is non-empty —
    // presumably guaranteed for well-formed IR (every block has a
    // terminator), but worth confirming.
    if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
      MadeChange |= processPhi(*Phi, TLI, AA);
  }

  if (MadeChange) return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}

}  // namespace

char MergeICmps::ID = 0;
INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
                      "Merge contiguous icmps into a memcmp", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
                    "Merge contiguous icmps into a memcmp", false, false)

Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }