//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Loop unrolling may create many similar GEPs for array accesses.
// e.g., a 2-level loop
//
// float a[32][32]; // global variable
//
// for (int i = 0; i < 2; ++i) {
//   for (int j = 0; j < 2; ++j) {
//     ...
//     ... = a[x + i][y + j];
//     ...
//   }
// }
//
// will probably be unrolled to:
//
// gep %a, 0, %x, %y; load
// gep %a, 0, %x, %y + 1; load
// gep %a, 0, %x + 1, %y; load
// gep %a, 0, %x + 1, %y + 1; load
//
// LLVM's GVN does not use partial redundancy elimination yet, and is thus
// unable to reuse (gep %a, 0, %x, %y). As a result, this missed optimization
// incurs a significant slowdown on targets with limited addressing modes. For
// instance, because the PTX target does not support the reg+reg addressing
// mode, the NVPTX backend emits PTX code that literally computes the pointer
// address of each GEP, wasting tons of registers. It emits the following PTX
// for the first load and similar PTX for the other loads.
//
// mov.u32         %r1, %x;
// mov.u32         %r2, %y;
// mul.wide.u32    %rl2, %r1, 128;
// mov.u64         %rl3, a;
// add.s64         %rl4, %rl3, %rl2;
// mul.wide.u32    %rl5, %r2, 4;
// add.s64         %rl6, %rl4, %rl5;
// ld.global.f32   %f1, [%rl6];
//
// To reduce register pressure, the optimization implemented in this file
// merges the common part of a group of GEPs, so we can compute each pointer
// address by adding a simple offset to the common part, saving many registers.
//
// It works by splitting each GEP into a variadic base and a constant offset.
// The variadic base can be computed once and reused by multiple GEPs, and the
// constant offsets can be nicely folded into the reg+immediate addressing mode
// (supported by most targets) without using any extra register.
//
// For instance, we transform the four GEPs and four loads in the above example
// into:
//
// base = gep a, 0, x, y
// load base
// load base + 1 * sizeof(float)
// load base + 32 * sizeof(float)
// load base + 33 * sizeof(float)
//
// Given the transformed IR, a backend that supports the reg+immediate
// addressing mode can easily fold the pointer arithmetic into the loads. For
// example, the NVPTX backend can easily fold the pointer arithmetic into the
// ld.global.f32 instructions, and the resultant PTX uses many fewer registers.
//
// mov.u32         %r1, %tid.x;
// mov.u32         %r2, %tid.y;
// mul.wide.u32    %rl2, %r1, 128;
// mov.u64         %rl3, a;
// add.s64         %rl4, %rl3, %rl2;
// mul.wide.u32    %rl5, %r2, 4;
// add.s64         %rl6, %rl4, %rl5;
// ld.global.f32   %f1, [%rl6];     // so far the same as unoptimized PTX
// ld.global.f32   %f2, [%rl6+4];   // much better
// ld.global.f32   %f3, [%rl6+128]; // much better
// ld.global.f32   %f4, [%rl6+132]; // much better
//
// Another improvement enabled by the LowerGEP flag is to lower a GEP with
// multiple indices to either multiple GEPs with a single index or arithmetic
// operations (depending on whether the target uses alias analysis in codegen).
// Such a transformation can have the following benefits:
// (1) It can always extract constants in the indices of structure types.
// (2) After such lowering, there are more optimization opportunities such as
//     CSE, LICM and CGP.
//
// E.g. The following GEPs have multiple indices:
//  BB1:
//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
//    load %p
//    ...
//  BB2:
//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
//    load %p2
//    ...
//
// We cannot apply CSE to the common part related to index "i64 %i". Lowering
// the GEPs makes that possible.
// If the target does not use alias analysis in codegen, this pass will
// lower a GEP with multiple indices into arithmetic operations:
// BB1:
//   %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//   %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %3 = add i64 %1, %2                          ; CSE opportunity
//   %4 = mul i64 %j1, length_of_struct
//   %5 = add i64 %3, %4
//   %6 = add i64 %5, struct_field_3              ; Constant offset
//   %p = inttoptr i64 %6 to i32*
//   load %p
//   ...
// BB2:
//   %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
//   %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %9 = add i64 %7, %8                          ; CSE opportunity
//   %10 = mul i64 %j2, length_of_struct
//   %11 = add i64 %9, %10
//   %12 = add i64 %11, struct_field_2            ; Constant offset
//   %p2 = inttoptr i64 %12 to i32*
//   load %p2
//   ...
//
// If the target uses alias analysis in codegen, this pass will lower a GEP
// with multiple indices into multiple GEPs with a single index:
// BB1:
//   %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//   %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
//   %4 = mul i64 %j1, length_of_struct
//   %5 = getelementptr i8* %3, i64 %4
//   %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
//   %p = bitcast i8* %6 to i32*
//   load %p
//   ...
// BB2:
//   %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
//   %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
//   %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
//   %10 = mul i64 %j2, length_of_struct
//   %11 = getelementptr i8* %9, i64 %10
//   %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
//   %p2 = bitcast i8* %12 to i32*
//   load %p2
//   ...
//
// Lowering GEPs can also benefit other passes such as LICM and CGP.
// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
// indices if one of the indices is variant. If we lower such a GEP into
// invariant parts and variant parts, LICM can hoist/sink those invariant
// parts.
// CGP (CodeGen Prepare) tries to sink address calculations that match the
// target's addressing modes. A GEP with multiple indices may not match and
// will not be sunk. If we lower such a GEP into smaller parts, CGP may sink
// some of them. So we end up with a better addressing mode.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <string>

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
    "disable-separate-const-offset-from-gep", cl::init(false),
    cl::desc("Do not separate the constant offset from a GEP instruction"),
    cl::Hidden);

// Setting this flag may emit false positives when the input module already
// contains dead instructions. Therefore, we set it only in unit tests that are
// free of dead code.
static cl::opt<bool>
    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
                     cl::desc("Verify this pass produces no dead code"),
                     cl::Hidden);

namespace {

/// A helper class for separating a constant offset from a GEP index.
///
/// In real programs, a GEP index may be more complicated than a simple addition
/// of something and a constant integer which can be trivially split. For
/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
/// constant offset, so that we can separate the index into (a << 3) + b and 5.
///
/// Therefore, this class looks into the expression that computes a given GEP
/// index, and tries to find a constant integer that can be hoisted to the
/// outermost level of the expression as an addition. Not every constant in an
/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to
/// (b * a + 5); nor can we transform (3 * (a + 5)) to (3 * a + 5). However, in
/// the latter case, -instcombine has probably already optimized (3 * (a + 5))
/// to (3 * a + 15).
class ConstantOffsetExtractor {
public:
  /// Extracts a constant offset from the given GEP index. It returns the
  /// new index representing the remainder (equal to the original index minus
  /// the constant offset), or nullptr if we cannot extract a constant offset.
  /// \p Idx           The given GEP index
  /// \p GEP           The given GEP
  /// \p UserChainTail Outputs the tail of UserChain so that we can
  ///                  garbage-collect unused instructions in UserChain.
  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
                        User *&UserChainTail);

  /// Looks for a constant offset from the given GEP index without extracting
  /// it. It returns the numeric value of the extracted constant offset (0 if
  /// failed). The meanings of the arguments are the same as in Extract.
  static int64_t Find(Value *Idx, GetElementPtrInst *GEP);

private:
  ConstantOffsetExtractor(Instruction *InsertionPt)
      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()) {}

  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search is
  /// successful, returns C and updates UserChain as a def-use chain from C to
  /// V; otherwise, UserChain is empty.
  ///
  /// \p V            The given expression
  /// \p SignExtended Whether V will be sign-extended in the computation of the
  ///                 GEP index
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  ///                 GEP index
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
  ///                 inbounds GEPs.
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);

  /// A helper function to look into both operands of a binary operator.
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
                            bool ZeroExtended);

  /// After finding the constant offset C from the GEP index I, we build a new
  /// index I' s.t. I' + C = I. This function builds and returns the new
  /// index I' according to UserChain produced by function "find".
  ///
  /// The building conceptually takes two steps:
  /// 1) iteratively distribute s/zext towards the leaves of the expression
  ///    tree that computes I
  /// 2) reassociate the expression tree to the form I' + C.
  ///
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  /// sext to a, b and 5 so that we have
  ///   sext(a) + (sext(b) + 5).
  /// Then, we reassociate it to
  ///   (sext(a) + sext(b)) + 5.
  /// Given this form, we know I' is sext(a) + sext(b).
  Value *rebuildWithoutConstOffset();

  /// After the first step of rebuilding the GEP index without the constant
  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  ///
  /// The function also updates UserChain to point to new subexpressions after
  /// distributing s/zext. e.g., the old UserChain of the above example is
  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  /// and the new UserChain is
  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///     zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
  ///
  /// \p ChainIndex The index to UserChain. ChainIndex is initially
  ///               UserChain.size() - 1, and is decremented during
  ///               the recursion.
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);

  /// Reassociates the GEP index to the form I' + C and returns I'.
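  ///
  /// \p ChainIndex The index into UserChain; the initial call uses
  ///               UserChain.size() - 1, and the recursion walks down to the
  ///               constant at UserChain[0].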
  Value *removeConstOffset(unsigned ChainIndex);

  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  /// returns "sext i32 (zext i16 V to i32) to i64".
  Value *applyExts(Value *V);

  /// A helper function that returns whether we can trace into the operands
  /// of binary operator BO for a constant offset.
  ///
  /// \p SignExtended Whether BO is surrounded by sext
  /// \p ZeroExtended Whether BO is surrounded by zext
  /// \p NonNegative  Whether BO is known to be non-negative, e.g., an index
  ///                 of an inbounds GEP.
  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                    bool NonNegative);

  /// The path from the constant offset to the old GEP index. e.g., if the GEP
  /// index is "a * b + (c + 5)", then after running function find, UserChain[0]
  /// will be the constant 5, UserChain[1] will be the subexpression "c + 5",
  /// and UserChain[2] will be the entire expression "a * b + (c + 5)".
  ///
  /// This path helps to rebuild the new GEP index.
  SmallVector<User *, 8> UserChain;

  /// A data structure used in rebuildWithoutConstOffset. Contains all
  /// sext/zext instructions along UserChain.
  SmallVector<CastInst *, 16> ExtInsts;

  /// Insertion position of cloned instructions.
  Instruction *IP;

  const DataLayout &DL;
};

/// A pass that tries to split every GEP in the function into a variadic
/// base and a constant offset. It is a FunctionPass because searching for the
/// constant offset may inspect other basic blocks.
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
public:
  static char ID;

  SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
      : FunctionPass(ID), LowerGEP(LowerGEP) {
    initializeSeparateConstOffsetFromGEPLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.setPreservesCFG();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override;

private:
  bool LowerGEP;
};

/// The implementation shared by the legacy pass and the new pass manager pass.
/// It tries to split every GEP in the function into a variadic base and a
/// constant offset. Searching for the constant offset may inspect other basic
/// blocks.
class SeparateConstOffsetFromGEP {
public:
  SeparateConstOffsetFromGEP(
      DominatorTree *DT, LoopInfo *LI, TargetLibraryInfo *TLI,
      function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
      : DT(DT), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}

  bool run(Function &F);

private:
  /// Track the operands of an add or sub.
  using ExprKey = std::pair<Value *, Value *>;

  /// Create a pair for use as a map key for a commutable operation.
  static ExprKey createNormalizedCommutablePair(Value *A, Value *B) {
    if (A < B)
      return {A, B};
    return {B, A};
  }

  /// Tries to split the given GEP into a variadic base and a constant offset,
  /// and returns true if the splitting succeeds.
  bool splitGEP(GetElementPtrInst *GEP);

  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of GEPs with a single index and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                              int64_t AccumulativeByteOffset);

  /// Lower a GEP with multiple indices into ptrtoint+arithmetic+inttoptr form.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of arithmetic operations and applies
  /// AccumulativeByteOffset to it.
  /// \p Variadic               The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToArithmetics(GetElementPtrInst *Variadic,
                          int64_t AccumulativeByteOffset);

  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it looks for constant offsets in indices of both
  /// sequential and structure types; otherwise, it only looks in sequential
  /// indices. The output NeedsExtraction indicates whether we successfully
  /// found a non-zero constant offset.
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);

  /// Canonicalize array indices to pointer-size integers. This helps to
  /// simplify the logic of splitting a GEP. For example, if a + b is a
  /// pointer-size integer, we have
  ///   gep base, a + b = gep (gep base, a), b
  /// However, this equality may not hold if the size of a + b is smaller than
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  /// pointer size before computing the address
  /// (http://llvm.org/docs/LangRef.html#id181).
  ///
  /// This canonicalization is very likely already done in clang and
  /// instcombine. Therefore, the program will probably remain the same.
  ///
  /// Returns true if the module changes.
  ///
  /// Verified in @i32_add in split-gep.ll
  bool canonicalizeArrayIndicesToIndexSize(GetElementPtrInst *GEP);

  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite the
  /// distributed sexts. For example,
  ///
  ///                              &a[sext(i +nsw (j +nsw 5))]
  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
  ///   => reunion                 &a[sext(i +nsw j)] + 5
  bool reuniteExts(Function &F);

  /// A helper that reunites sexts in an instruction.
  bool reuniteExts(Instruction *I);

  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
  Instruction *findClosestMatchingDominator(
      ExprKey Key, Instruction *Dominatee,
      DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs);

  /// Verify F is free of dead code.
  void verifyNoDeadCode(Function &F);

  bool hasMoreThanOneUseInLoop(Value *V, Loop *L);

  // Swap the index operands of two GEPs.
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);

  // Check if it is safe to swap the operands of two GEPs.
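  // Both GEPs must be single-index GEPs in the same basic block, and the swap
  // is skipped when the first GEP's index is loop invariant or already carries
  // a constant add/sub, since the second GEP could then be folded away anyway.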
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
                            Loop *CurLoop);

  const DataLayout *DL = nullptr;
  DominatorTree *DT = nullptr;
  LoopInfo *LI;
  TargetLibraryInfo *TLI;
  // Retrieved lazily since not always used.
  function_ref<TargetTransformInfo &(Function &)> GetTTI;

  /// Whether to lower a GEP with multiple indices into arithmetic operations
  /// or multiple GEPs with a single index.
  bool LowerGEP;

  DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingAdds;
  DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
};

} // end anonymous namespace

char SeparateConstOffsetFromGEPLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
    SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
    "Split GEPs to a variadic base and a constant offset for better CSE", false,
    false)

FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
  return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
}

bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
                                           bool ZeroExtended,
                                           BinaryOperator *BO,
                                           bool NonNegative) {
  // We only consider ADD, SUB and OR, because a non-zero constant found in
  // expressions composed of these operations can be easily hoisted as a
  // constant offset by reassociation.
  if (BO->getOpcode() != Instruction::Add &&
      BO->getOpcode() != Instruction::Sub &&
      BO->getOpcode() != Instruction::Or) {
    return false;
  }

  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
  // Do not trace into "or" unless it is equivalent to "add".
  // This is the case if the or's disjoint flag is set.
  if (BO->getOpcode() == Instruction::Or &&
      !cast<PossiblyDisjointInst>(BO)->isDisjoint())
    return false;

  // FIXME: We don't currently support constants from the RHS of subs,
  // when we are zero-extended, because we need a way to zero-extend
  // them before they are negated.
  if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
    return false;

  // In addition, tracing into BO requires that its surrounding s/zext (if
  // any) is distributable to both operands.
  //
  // Suppose BO = A op B.
  //  SignExtended | ZeroExtended | Distributable?
  // --------------+--------------+----------------------------------
  //       0       |      0       | true because no s/zext exists
  //       0       |      1       | zext(BO) == zext(A) op zext(B)
  //       1       |      0       | sext(BO) == sext(A) op sext(B)
  //       1       |      1       | zext(sext(BO)) ==
  //               |              |     zext(sext(A)) op zext(sext(B))
  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
    // If a + b >= 0 and (a >= 0 or b >= 0), then
    //   sext(a + b) = sext(a) + sext(b)
    // even if the addition is not marked nsw.
    //
    // Leveraging this invariant, we can trace into a sign-extended inbounds
    // GEP index if the constant offset is non-negative.
    //
    // Verified in @sext_add in split-gep.ll.
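    //
    // Check whether either operand is a non-negative constant; together with
    // the knowledge that the whole sum is non-negative, that is enough to
    // distribute the sext even without an nsw flag.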
    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
      if (!ConstLHS->isNegative())
        return true;
    }
    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
      if (!ConstRHS->isNegative())
        return true;
    }
  }

  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
  if (BO->getOpcode() == Instruction::Add ||
      BO->getOpcode() == Instruction::Sub) {
    if (SignExtended && !BO->hasNoSignedWrap())
      return false;
    if (ZeroExtended && !BO->hasNoUnsignedWrap())
      return false;
  }

  return true;
}

APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
                                                   bool SignExtended,
                                                   bool ZeroExtended) {
  // Save off the current height of the chain, in case we need to restore it.
  size_t ChainLength = UserChain.size();

  // BO being non-negative does not shed light on whether its operands are
  // non-negative. Clear the NonNegative flag here.
  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
                              /* NonNegative */ false);
  // If we found a constant offset in the left operand, stop and return that.
  // This shortcut might cause us to miss opportunities to combine the
  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
  // However, such cases are probably already handled by -instcombine,
  // given this pass runs after the standard optimizations.
  if (ConstantOffset != 0) return ConstantOffset;

  // Reset the chain back to where it was when we started exploring this node,
  // since visiting the LHS didn't pan out.
  UserChain.resize(ChainLength);

  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
                        /* NonNegative */ false);
  // If BO is a sub operator, negate the constant offset found in the right
  // operand.
  if (BO->getOpcode() == Instruction::Sub)
    ConstantOffset = -ConstantOffset;

  // If RHS wasn't a suitable candidate either, reset the chain again.
  if (ConstantOffset == 0)
    UserChain.resize(ChainLength);

  return ConstantOffset;
}

APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
                                    bool ZeroExtended, bool NonNegative) {
  // TODO(jingyue): We could trace into integer/pointer casts, such as
  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
  // integers because it gives good enough results for our benchmarks.
  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();

  // We cannot do much with Values that are not a User, such as an Argument.
  User *U = dyn_cast<User>(V);
  if (U == nullptr) return APInt(BitWidth, 0);

  APInt ConstantOffset(BitWidth, 0);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
    // Hooray, we found it!
    ConstantOffset = CI->getValue();
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
    // Trace into subexpressions for more hoisting opportunities.
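    // CanTraceInto checks that any surrounding s/zext distributes over the
    // operands of BO before we recurse into them.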
    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
  } else if (isa<TruncInst>(V)) {
    ConstantOffset =
        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
            .trunc(BitWidth);
  } else if (isa<SExtInst>(V)) {
    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
                          ZeroExtended, NonNegative).sext(BitWidth);
  } else if (isa<ZExtInst>(V)) {
    // As an optimization, we can clear the SignExtended flag because
    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
    //
    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
    ConstantOffset =
        find(U->getOperand(0), /* SignExtended */ false,
             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
  }

  // If we found a non-zero constant offset, add it to the path for
  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
  // help this optimization.
  if (ConstantOffset != 0)
    UserChain.push_back(U);
  return ConstantOffset;
}

Value *ConstantOffsetExtractor::applyExts(Value *V) {
  Value *Current = V;
  // ExtInsts is built in the use-def order. Therefore, we apply them to V
  // in reverse order.
  for (CastInst *I : llvm::reverse(ExtInsts)) {
    if (Constant *C = dyn_cast<Constant>(Current)) {
      // Try to constant fold the cast.
      Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL);
      if (Current)
        continue;
    }

    Instruction *Ext = I->clone();
    Ext->setOperand(0, Current);
    Ext->insertBefore(IP);
    Current = Ext;
  }
  return Current;
}

Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
  distributeExtsAndCloneChain(UserChain.size() - 1);
  // Remove all nullptrs (used to be s/zext) from UserChain.
  unsigned NewSize = 0;
  for (User *I : UserChain) {
    if (I != nullptr) {
      UserChain[NewSize] = I;
      NewSize++;
    }
  }
  UserChain.resize(NewSize);
  return removeConstOffset(UserChain.size() - 1);
}

Value *
ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
  User *U = UserChain[ChainIndex];
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(U));
    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
  }

  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
    assert(
        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
        "Only the following instructions can be traced: sext, zext & trunc");
    ExtInsts.push_back(Cast);
    UserChain[ChainIndex] = nullptr;
    return distributeExtsAndCloneChain(ChainIndex - 1);
  }

  // Function find only traces into BinaryOperators and CastInsts.
  BinaryOperator *BO = cast<BinaryOperator>(U);
  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ?
                       0 : 1);
  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);

  BinaryOperator *NewBO = nullptr;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
                                   BO->getName(), IP);
  } else {
    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
                                   BO->getName(), IP);
  }
  return UserChain[ChainIndex] = NewBO;
}

Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
  if (ChainIndex == 0) {
    assert(isa<ConstantInt>(UserChain[ChainIndex]));
    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
  }

  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
  assert((BO->use_empty() || BO->hasOneUse()) &&
         "distributeExtsAndCloneChain clones each BinaryOperator in "
         "UserChain, so no one should be used more than "
         "once");

  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
  Value *NextInChain = removeConstOffset(ChainIndex - 1);
  Value *TheOther = BO->getOperand(1 - OpNo);

  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
  // sub-expression to be just TheOther.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
      return TheOther;
  }

  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
  if (BO->getOpcode() == Instruction::Or) {
    // Rebuild "or" as "add", because "or" may be invalid for the new
    // expression.
    //
    // For instance, given
    //   a | (b + 5) where a and b + 5 have no common bits,
    // we can extract 5 as the constant offset.
    //
    // However, reusing the "or" in the new index would give us
    //   (a | b) + 5
    // which does not equal a | (b + 5).
    //
    // Replacing the "or" with "add" is fine, because
    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
    NewOp = Instruction::Add;
  }

  BinaryOperator *NewBO;
  if (OpNo == 0) {
    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
  } else {
    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
  }
  NewBO->takeName(BO);
  return NewBO;
}

Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                        User *&UserChainTail) {
  ConstantOffsetExtractor Extractor(GEP);
  // Find a non-zero constant offset first.
  APInt ConstantOffset =
      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                     GEP->isInBounds());
  if (ConstantOffset == 0) {
    UserChainTail = nullptr;
    return nullptr;
  }
  // Separates the constant offset from the GEP index.
  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
  UserChainTail = Extractor.UserChain.back();
  return IdxWithoutConstOffset;
}

int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
  // If Idx is an index of an inbounds GEP, Idx is guaranteed to be
  // non-negative.
  return ConstantOffsetExtractor(GEP)
      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
            GEP->isInBounds())
      .getSExtValue();
}

bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
    GetElementPtrInst *GEP) {
  bool Changed = false;
  Type *PtrIdxTy = DL->getIndexType(GEP->getType());
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
       I != E; ++I, ++GTI) {
    // Skip struct member indices which must be i32.
    if (GTI.isSequential()) {
      if ((*I)->getType() != PtrIdxTy) {
        *I = CastInst::CreateIntegerCast(*I, PtrIdxTy, true, "idxprom", GEP);
        Changed = true;
      }
    }
  }
  return Changed;
}

int64_t
SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                 bool &NeedsExtraction) {
  NeedsExtraction = false;
  int64_t AccumulativeByteOffset = 0;
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Constant offsets of scalable types are not really constant.
      if (GTI.getIndexedType()->isScalableTy())
        continue;

      // Tries to extract a constant offset from this GEP index.
      int64_t ConstantOffset =
          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
      if (ConstantOffset != 0) {
        NeedsExtraction = true;
        // A GEP may have multiple indices. We accumulate the extracted
        // constant offset to a byte offset, and later offset the remainder of
        // the original GEP with this byte offset.
        AccumulativeByteOffset +=
            ConstantOffset * GTI.getSequentialElementStride(*DL);
      }
    } else if (LowerGEP) {
      StructType *StTy = GTI.getStructType();
      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
      // Skip field 0 as the offset is always 0.
      if (Field != 0) {
        NeedsExtraction = true;
        AccumulativeByteOffset +=
            DL->getStructLayout(StTy)->getElementOffset(Field);
      }
    }
  }
  return AccumulativeByteOffset;
}

void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *PtrIndexTy = DL->getIndexType(Variadic->getType());

  Value *ResultPtr = Variadic->getOperand(0);
  Loop *L = LI->getLoopFor(Variadic->getParent());
  // Check that the base is loop invariant and not used more than once inside
  // the loop; such a base is a candidate for the GEP-swapping heuristic below.
  bool isSwapCandidate =
      L && L->isLoopInvariant(ResultPtr) &&
      !hasMoreThanOneUseInLoop(ResultPtr, L);
  Value *FirstResult = nullptr;

  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create an ugly GEP for each sequential index. We don't create GEPs for
  // structure indices, as they are accumulated in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(),
                                GTI.getSequentialElementStride(*DL));
      // Scale the index by element size.
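      // A shift is enough when the element size is a power of two; otherwise
      // fall back to a multiply.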
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(PtrIndexTy, ElementSize.logBase2()));
        } else {
          Idx =
              Builder.CreateMul(Idx, ConstantInt::get(PtrIndexTy, ElementSize));
        }
      }
      // Create an ugly GEP with a single index for each index.
      ResultPtr = Builder.CreatePtrAdd(ResultPtr, Idx, "uglygep");
      if (FirstResult == nullptr)
        FirstResult = ResultPtr;
    }
  }

  // Create a GEP with the constant offset index.
  if (AccumulativeByteOffset != 0) {
    Value *Offset = ConstantInt::get(PtrIndexTy, AccumulativeByteOffset);
    ResultPtr = Builder.CreatePtrAdd(ResultPtr, Offset, "uglygep");
  } else
    isSwapCandidate = false;

  // If we created a GEP with a constant index, and the base is loop invariant,
  // then we swap the first one with it, so LICM can move the constant GEP out
  // later.
  auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
  auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
    swapGEPOperand(FirstGEP, SecondGEP);

  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

void
SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
                                               int64_t AccumulativeByteOffset) {
  IRBuilder<> Builder(Variadic);
  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
  assert(IntPtrTy == DL->getIndexType(Variadic->getType()) &&
         "Pointer type must match index type for arithmetic-based lowering of "
         "split GEPs");

  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
  gep_type_iterator GTI = gep_type_begin(*Variadic);
  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
  // don't create arithmetic for structure indices, as they are accumulated
  // in the constant offset index.
  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      Value *Idx = Variadic->getOperand(I);
      // Skip zero indices.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
        if (CI->isZero())
          continue;

      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
                                GTI.getSequentialElementStride(*DL));
      // Scale the index by element size.
      if (ElementSize != 1) {
        if (ElementSize.isPowerOf2()) {
          Idx = Builder.CreateShl(
              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
        } else {
          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
        }
      }
      // Create an ADD for each index.
      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
    }
  }

  // Create an ADD for the constant offset index.
  if (AccumulativeByteOffset != 0) {
    ResultPtr = Builder.CreateAdd(
        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
  }

  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
  Variadic->replaceAllUsesWith(ResultPtr);
  Variadic->eraseFromParent();
}

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
  // Skip vector GEPs.
  if (GEP->getType()->isVectorTy())
    return false;

  // The backend can already nicely handle the case where all indices are
  // constant.
  if (GEP->hasAllConstantIndices())
    return false;

  bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);

  bool NeedsExtraction;
  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

  if (!NeedsExtraction)
    return Changed;

  TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());

  // If LowerGEP is disabled, before really splitting the GEP, check whether
  // the backend supports the addressing mode we are about to produce. If not,
  // this splitting probably won't be beneficial.
  // If LowerGEP is enabled, even if the extracted constant offset cannot match
  // the addressing mode, we can still do optimizations on other lowered parts
  // of the variable indices. Therefore, we don't check for addressing modes in
  // that case.
  if (!LowerGEP) {
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
                                   /*HasBaseReg=*/true, /*Scale=*/0,
                                   AddrSpace)) {
      return Changed;
    }
  }

  // Remove the constant offset in each sequential index. The resultant GEP
  // computes the variadic base.
  // Notice that we don't remove struct field indices here. If LowerGEP is
  // disabled, a structure index is not accumulated and we still use the old
  // one. If LowerGEP is enabled, a structure index is accumulated in the
  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
  // handle the constant offset and won't need a new structure index.
  gep_type_iterator GTI = gep_type_begin(*GEP);
  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
    if (GTI.isSequential()) {
      // Constant offsets of scalable types are not really constant.
      if (GTI.getIndexedType()->isScalableTy())
        continue;

      // Splits this GEP index into a variadic part and a constant offset, and
      // uses the variadic part as the new index.
      Value *OldIdx = GEP->getOperand(I);
      User *UserChainTail;
      Value *NewIdx =
          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail);
      if (NewIdx != nullptr) {
        // Switches to the index with the constant offset removed.
        GEP->setOperand(I, NewIdx);
        // After switching to the new index, we can garbage-collect UserChain
        // and the old index if they are not used.
        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
      }
    }
  }

  // Clear the inbounds attribute because the new index may be off-bound.
  // e.g.,
  //
  //   b     = add i64 a, 5
  //   addr  = gep inbounds float, float* p, i64 b
  //
  // is transformed to:
  //
  //   addr2 = gep float, float* p, i64 a ; inbounds removed
  //   addr  = gep inbounds float, float* addr2, i64 5
  //
  // If a is -4, although the old index b is in bounds, the new index a is
  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
  // inbounds keyword is not present, the offsets are added to the base
  // address with silently-wrapping two's complement arithmetic".
  // Therefore, the final code will be semantically equivalent.
  //
  // TODO(jingyue): do some range analysis to keep as many inbounds as
  // possible. GEPs with inbounds are more friendly to alias analysis.
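  // Remember whether the original GEP was inbounds so that the trailing GEP
  // that re-adds the accumulated constant offset can keep the flag.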
  bool GEPWasInBounds = GEP->isInBounds();
  GEP->setIsInBounds(false);

  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
  if (LowerGEP) {
    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
    // arithmetic operations if the target uses alias analysis in codegen.
    // Additionally, pointers that aren't integral (and so can't be safely
    // converted to integers) or those whose offset size differs from their
    // pointer size (which means they carry extra data that integer arithmetic
    // could corrupt) can't be lowered in this way.
    unsigned AddrSpace = GEP->getPointerAddressSpace();
    bool PointerHasExtraData = DL->getPointerSizeInBits(AddrSpace) !=
                               DL->getIndexSizeInBits(AddrSpace);
    if (TTI.useAA() || DL->isNonIntegralAddressSpace(AddrSpace) ||
        PointerHasExtraData)
      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
    else
      lowerToArithmetics(GEP, AccumulativeByteOffset);
    return true;
  }

  // No need to create another GEP if the accumulative byte offset is 0.
  if (AccumulativeByteOffset == 0)
    return true;

  // Offsets the base with the accumulative byte offset.
  //
  //   %gep                        ; the base
  //   ... %gep ...
  //
  // => add the offset
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep i8, %gep2, %offset
  //   %gep                        ; will be removed
  //   ... %gep ...
  //
  // => replace all uses of %gep with %new.gep and remove %gep
  //
  //   %gep2                       ; clone of %gep
  //   %new.gep = gep i8, %gep2, %offset
  //   ... %new.gep ...
  Instruction *NewGEP = GEP->clone();
  NewGEP->insertBefore(GEP);

  Type *PtrIdxTy = DL->getIndexType(GEP->getType());
  IRBuilder<> Builder(GEP);
  NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
      NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
      GEP->getName(), GEPWasInBounds));
  NewGEP->copyMetadata(*GEP);

  GEP->replaceAllUsesWith(NewGEP);
  GEP->eraseFromParent();

  return true;
}

bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
    return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
  return Impl.run(F);
}

bool SeparateConstOffsetFromGEP::run(Function &F) {
  if (DisableSeparateConstOffsetFromGEP)
    return false;

  DL = &F.getParent()->getDataLayout();
  bool Changed = false;
  for (BasicBlock &B : F) {
    if (!DT->isReachableFromEntry(&B))
      continue;

    for (Instruction &I : llvm::make_early_inc_range(B))
      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I))
        Changed |= splitGEP(GEP);
    // No need to split GEP ConstantExprs because all their indices are
    // constant already.
  }

  Changed |= reuniteExts(F);

  if (VerifyNoDeadCode)
    verifyNoDeadCode(F);

  return Changed;
}

Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
    ExprKey Key, Instruction *Dominatee,
    DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs) {
  auto Pos = DominatingExprs.find(Key);
  if (Pos == DominatingExprs.end())
    return nullptr;

  auto &Candidates = Pos->second;
  // Because we process the basic blocks in pre-order of the dominator tree, a
  // candidate that doesn't dominate the current instruction won't dominate any
  // future instruction either. Therefore, we pop it out of the stack. This
  // optimization makes the algorithm O(n).
  while (!Candidates.empty()) {
    Instruction *Candidate = Candidates.back();
    if (DT->dominates(Candidate, Dominatee))
      return Candidate;
    Candidates.pop_back();
  }
  return nullptr;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
  if (!I->getType()->isIntOrIntVectorTy())
    return false;

  // Dom: LHS+RHS
  // I: sext(LHS)+sext(RHS)
  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
  // TODO: handle zext
  Value *LHS = nullptr, *RHS = nullptr;
  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      ExprKey Key = createNormalizedCommutablePair(LHS, RHS);
      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
    if (LHS->getType() == RHS->getType()) {
      if (auto *Dom =
              findClosestMatchingDominator({LHS, RHS}, I, DominatingSubs)) {
        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
        NewSExt->takeName(I);
        I->replaceAllUsesWith(NewSExt);
        RecursivelyDeleteTriviallyDeadInstructions(I);
        return true;
      }
    }
  }

  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
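  // Only record adds/subs that both carry the nsw flag and whose poison result
  // would make the program undefined; this guarantees no signed overflow at
  // every point the expression is reused.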
  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I)) {
      ExprKey Key = createNormalizedCommutablePair(LHS, RHS);
      DominatingAdds[Key].push_back(I);
    }
  } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
    if (programUndefinedIfPoison(I))
      DominatingSubs[{LHS, RHS}].push_back(I);
  }
  return false;
}

bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
  bool Changed = false;
  DominatingAdds.clear();
  DominatingSubs.clear();
  for (const auto Node : depth_first(DT)) {
    BasicBlock *BB = Node->getBlock();
    for (Instruction &I : llvm::make_early_inc_range(*BB))
      Changed |= reuniteExts(&I);
  }
  return Changed;
}

void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
  for (BasicBlock &B : F) {
    for (Instruction &I : B) {
      if (isInstructionTriviallyDead(&I)) {
        std::string ErrMessage;
        raw_string_ostream RSO(ErrMessage);
        RSO << "Dead instruction detected!\n" << I << "\n";
        llvm_unreachable(RSO.str().c_str());
      }
    }
  }
}

bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
  if (!FirstGEP || !FirstGEP->hasOneUse())
    return false;

  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
    return false;

  if (FirstGEP == SecondGEP)
    return false;

  unsigned FirstNum = FirstGEP->getNumOperands();
  unsigned SecondNum = SecondGEP->getNumOperands();
  // Give up if the number of operands is not 2.
  if (FirstNum != SecondNum || FirstNum != 2)
    return false;

  Value *FirstBase = FirstGEP->getOperand(0);
  Value *SecondBase = SecondGEP->getOperand(0);
  Value *FirstOffset = FirstGEP->getOperand(1);
  // Give up if the index of the first GEP is loop invariant.
  if (CurLoop->isLoopInvariant(FirstOffset))
    return false;

  // Give up if the bases don't have the same type.
  if (FirstBase->getType() != SecondBase->getType())
    return false;

  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);

  // Check if the second operand of the first GEP has a constant coefficient.
  // For example, for the following code, we won't gain anything by hoisting
  // the second GEP out because the second GEP can be folded away.
  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
  //   %67 = shl i64 %scevgep.sum.ur159, 2
  //   %uglygep160 = getelementptr i8* %65, i64 %67
  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024

  // Skip a constant shift instruction which may be generated by splitting
  // GEPs.
  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));

  // Give up if FirstOffsetDef is an Add or Sub with a constant, because it
  // may not be profitable at all due to constant folding.
  if (FirstOffsetDef)
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
      unsigned opc = BO->getOpcode();
      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
          (isa<ConstantInt>(BO->getOperand(0)) ||
           isa<ConstantInt>(BO->getOperand(1))))
        return false;
    }
  return true;
}

bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
  int UsesInLoop = 0;
  for (User *U : V->users()) {
    if (Instruction *User = dyn_cast<Instruction>(U))
      if (L->contains(User))
        if (++UsesInLoop > 1)
          return true;
  }
  return false;
}

void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
                                                GetElementPtrInst *Second) {
  Value *Offset1 = First->getOperand(1);
  Value *Offset2 = Second->getOperand(1);
  First->setOperand(1, Offset2);
  Second->setOperand(1, Offset1);

  // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
  const DataLayout &DAL = First->getModule()->getDataLayout();
  APInt Offset(DAL.getIndexSizeInBits(
                   cast<PointerType>(First->getType())->getAddressSpace()),
               0);
  Value *NewBase =
      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
  uint64_t ObjectSize;
  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
      Offset.ugt(ObjectSize)) {
    First->setIsInBounds(false);
    Second->setIsInBounds(false);
  } else
    First->setIsInBounds(true);
}

void SeparateConstOffsetFromGEPPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SeparateConstOffsetFromGEPPass> *>(this)
      ->printPipeline(OS, MapClassName2PassName);
  OS << '<';
  if (LowerGEP)
    OS << "lower-gep";
  OS << '>';
}

PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
    return AM.getResult<TargetIRAnalysis>(F);
  };
  SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
  if (!Impl.run(F))
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}