1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// This file implements SLP analysis based on VPlan. The analysis is based on 10 /// the ideas described in 11 /// 12 /// Look-ahead SLP: auto-vectorization in the presence of commutative 13 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 14 /// Luís F. W. Góes 15 /// 16 //===----------------------------------------------------------------------===// 17 18 #include "VPlan.h" 19 #include "llvm/ADT/DepthFirstIterator.h" 20 #include "llvm/ADT/PostOrderIterator.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/LoopInfo.h" 24 #include "llvm/Analysis/VectorUtils.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/CFG.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/InstrTypes.h" 29 #include "llvm/IR/Instruction.h" 30 #include "llvm/IR/Instructions.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/ErrorHandling.h" 36 #include "llvm/Support/GraphWriter.h" 37 #include "llvm/Support/raw_ostream.h" 38 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 39 #include <cassert> 40 #include <iterator> 41 #include <string> 42 #include <vector> 43 44 using namespace llvm; 45 46 #define DEBUG_TYPE "vplan-slp" 47 48 // Number of levels to look ahead when re-ordering multi node operands. 49 static unsigned LookaheadMaxDepth = 5; 50 51 VPInstruction *VPlanSlp::markFailed() { 52 // FIXME: Currently this is used to signal we hit instructions we cannot 53 // trivially SLP'ize. 54 CompletelySLP = false; 55 return nullptr; 56 } 57 58 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) { 59 if (all_of(Operands, [](VPValue *V) { 60 return cast<VPInstruction>(V)->getUnderlyingInstr(); 61 })) { 62 unsigned BundleSize = 0; 63 for (VPValue *V : Operands) { 64 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType(); 65 assert(!T->isVectorTy() && "Only scalar types supported for now"); 66 BundleSize += T->getScalarSizeInBits(); 67 } 68 WidestBundleBits = std::max(WidestBundleBits, BundleSize); 69 } 70 71 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); 72 assert(Res.second && 73 "Already created a combined instruction for the operand bundle"); 74 (void)Res; 75 } 76 77 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { 78 // Currently we only support VPInstructions. 79 if (!all_of(Operands, [](VPValue *Op) { 80 return Op && isa<VPInstruction>(Op) && 81 cast<VPInstruction>(Op)->getUnderlyingInstr(); 82 })) { 83 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); 84 return false; 85 } 86 87 // Check if opcodes and type width agree for all instructions in the bundle. 88 // FIXME: Differing widths/opcodes can be handled by inserting additional 89 // instructions. 90 // FIXME: Deal with non-primitive types. 91 const Instruction *OriginalInstr = 92 cast<VPInstruction>(Operands[0])->getUnderlyingInstr(); 93 unsigned Opcode = OriginalInstr->getOpcode(); 94 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); 95 if (!all_of(Operands, [Opcode, Width](VPValue *Op) { 96 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr(); 97 return I->getOpcode() == Opcode && 98 I->getType()->getPrimitiveSizeInBits() == Width; 99 })) { 100 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); 101 return false; 102 } 103 104 // For now, all operands must be defined in the same BB. 105 if (any_of(Operands, [this](VPValue *Op) { 106 return cast<VPInstruction>(Op)->getParent() != &this->BB; 107 })) { 108 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); 109 return false; 110 } 111 112 if (any_of(Operands, 113 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { 114 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); 115 return false; 116 } 117 118 // For loads, check that there are no instructions writing to memory in 119 // between them. 120 // TODO: we only have to forbid instructions writing to memory that could 121 // interfere with any of the loads in the bundle 122 if (Opcode == Instruction::Load) { 123 unsigned LoadsSeen = 0; 124 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent(); 125 for (auto &I : *Parent) { 126 auto *VPI = cast<VPInstruction>(&I); 127 if (VPI->getOpcode() == Instruction::Load && 128 std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) 129 LoadsSeen++; 130 131 if (LoadsSeen == Operands.size()) 132 break; 133 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { 134 LLVM_DEBUG( 135 dbgs() << "VPSLP: instruction modifying memory between loads\n"); 136 return false; 137 } 138 } 139 140 if (!all_of(Operands, [](VPValue *Op) { 141 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 142 ->isSimple(); 143 })) { 144 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); 145 return false; 146 } 147 } 148 149 if (Opcode == Instruction::Store) 150 if (!all_of(Operands, [](VPValue *Op) { 151 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) 152 ->isSimple(); 153 })) { 154 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); 155 return false; 156 } 157 158 return true; 159 } 160 161 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, 162 unsigned OperandIndex) { 163 SmallVector<VPValue *, 4> Operands; 164 for (VPValue *V : Values) { 165 auto *U = cast<VPUser>(V); 166 Operands.push_back(U->getOperand(OperandIndex)); 167 } 168 return Operands; 169 } 170 171 static bool areCommutative(ArrayRef<VPValue *> Values) { 172 return Instruction::isCommutative( 173 cast<VPInstruction>(Values[0])->getOpcode()); 174 } 175 176 static SmallVector<SmallVector<VPValue *, 4>, 4> 177 getOperands(ArrayRef<VPValue *> Values) { 178 SmallVector<SmallVector<VPValue *, 4>, 4> Result; 179 auto *VPI = cast<VPInstruction>(Values[0]); 180 181 switch (VPI->getOpcode()) { 182 case Instruction::Load: 183 llvm_unreachable("Loads terminate a tree, no need to get operands"); 184 case Instruction::Store: 185 Result.push_back(getOperands(Values, 0)); 186 break; 187 default: 188 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) 189 Result.push_back(getOperands(Values, I)); 190 break; 191 } 192 193 return Result; 194 } 195 196 /// Returns the opcode of Values or ~0 if they do not all agree. 197 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) { 198 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode(); 199 if (any_of(Values, [Opcode](VPValue *V) { 200 return cast<VPInstruction>(V)->getOpcode() != Opcode; 201 })) 202 return None; 203 return {Opcode}; 204 } 205 206 /// Returns true if A and B access sequential memory if they are loads or 207 /// stores or if they have identical opcodes otherwise. 208 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, 209 VPInterleavedAccessInfo &IAI) { 210 if (A->getOpcode() != B->getOpcode()) 211 return false; 212 213 if (A->getOpcode() != Instruction::Load && 214 A->getOpcode() != Instruction::Store) 215 return true; 216 auto *GA = IAI.getInterleaveGroup(A); 217 auto *GB = IAI.getInterleaveGroup(B); 218 219 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); 220 } 221 222 /// Implements getLAScore from Listing 7 in the paper. 223 /// Traverses and compares operands of V1 and V2 to MaxLevel. 224 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, 225 VPInterleavedAccessInfo &IAI) { 226 if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2)) 227 return 0; 228 229 if (MaxLevel == 0) 230 return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1), 231 cast<VPInstruction>(V2), IAI); 232 233 unsigned Score = 0; 234 for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I) 235 for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J) 236 Score += getLAScore(cast<VPUser>(V1)->getOperand(I), 237 cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI); 238 return Score; 239 } 240 241 std::pair<VPlanSlp::OpMode, VPValue *> 242 VPlanSlp::getBest(OpMode Mode, VPValue *Last, 243 SmallVectorImpl<VPValue *> &Candidates, 244 VPInterleavedAccessInfo &IAI) { 245 LLVM_DEBUG(dbgs() << " getBest\n"); 246 VPValue *Best = Candidates[0]; 247 SmallVector<VPValue *, 4> BestCandidates; 248 249 LLVM_DEBUG(dbgs() << " Candidates for " 250 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); 251 for (auto *Candidate : Candidates) { 252 auto *LastI = cast<VPInstruction>(Last); 253 auto *CandidateI = cast<VPInstruction>(Candidate); 254 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { 255 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() 256 << " "); 257 BestCandidates.push_back(Candidate); 258 } 259 } 260 LLVM_DEBUG(dbgs() << "\n"); 261 262 if (BestCandidates.empty()) 263 return {OpMode::Failed, nullptr}; 264 265 if (BestCandidates.size() == 1) 266 return {Mode, BestCandidates[0]}; 267 268 if (Mode == OpMode::Opcode) { 269 unsigned BestScore = 0; 270 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { 271 unsigned PrevScore = ~0u; 272 bool AllSame = true; 273 274 // FIXME: Avoid visiting the same operands multiple times. 275 for (auto *Candidate : BestCandidates) { 276 unsigned Score = getLAScore(Last, Candidate, Depth, IAI); 277 if (PrevScore == ~0u) 278 PrevScore = Score; 279 if (PrevScore != Score) 280 AllSame = false; 281 PrevScore = Score; 282 283 if (Score > BestScore) { 284 BestScore = Score; 285 Best = Candidate; 286 } 287 } 288 if (!AllSame) 289 break; 290 } 291 } 292 LLVM_DEBUG(dbgs() << "Found best " 293 << *cast<VPInstruction>(Best)->getUnderlyingInstr() 294 << "\n"); 295 std::remove(Candidates.begin(), Candidates.end(), Best); 296 297 return {Mode, Best}; 298 } 299 300 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { 301 SmallVector<MultiNodeOpTy, 4> FinalOrder; 302 SmallVector<OpMode, 4> Mode; 303 FinalOrder.reserve(MultiNodeOps.size()); 304 Mode.reserve(MultiNodeOps.size()); 305 306 LLVM_DEBUG(dbgs() << "Reordering multinode\n"); 307 308 for (auto &Operands : MultiNodeOps) { 309 FinalOrder.push_back({Operands.first, {Operands.second[0]}}); 310 if (cast<VPInstruction>(Operands.second[0])->getOpcode() == 311 Instruction::Load) 312 Mode.push_back(OpMode::Load); 313 else 314 Mode.push_back(OpMode::Opcode); 315 } 316 317 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { 318 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); 319 SmallVector<VPValue *, 4> Candidates; 320 Candidates.reserve(MultiNodeOps.size()); 321 LLVM_DEBUG(dbgs() << " Candidates "); 322 for (auto Ops : MultiNodeOps) { 323 LLVM_DEBUG( 324 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() 325 << " "); 326 Candidates.push_back(Ops.second[Lane]); 327 } 328 LLVM_DEBUG(dbgs() << "\n"); 329 330 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { 331 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); 332 if (Mode[Op] == OpMode::Failed) 333 continue; 334 335 VPValue *Last = FinalOrder[Op].second[Lane - 1]; 336 std::pair<OpMode, VPValue *> Res = 337 getBest(Mode[Op], Last, Candidates, IAI); 338 if (Res.second) 339 FinalOrder[Op].second.push_back(Res.second); 340 else 341 // TODO: handle this case 342 FinalOrder[Op].second.push_back(markFailed()); 343 } 344 } 345 346 return FinalOrder; 347 } 348 349 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { 350 LLVM_DEBUG(dbgs() << " Ops: "); 351 for (auto Op : Values) 352 if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr()) 353 LLVM_DEBUG(dbgs() << *Instr << " | "); 354 else 355 LLVM_DEBUG(dbgs() << " nullptr | "); 356 LLVM_DEBUG(dbgs() << "\n"); 357 } 358 359 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { 360 assert(!Values.empty() && "Need some operands!"); 361 362 // If we already visited this instruction bundle, re-use the existing node 363 auto I = BundleToCombined.find(to_vector<4>(Values)); 364 if (I != BundleToCombined.end()) { 365 #ifdef NDEBUG 366 // Check that the resulting graph is a tree. If we re-use a node, this means 367 // its values have multiple users. We only allow this, if all users of each 368 // value are the same instruction. 369 for (auto *V : Values) { 370 auto UI = V->user_begin(); 371 auto *FirstUser = *UI++; 372 while (UI != V->use_end()) { 373 assert(*UI == FirstUser && "Currently we only support SLP trees."); 374 UI++; 375 } 376 } 377 #endif 378 return I->second; 379 } 380 381 // Dump inputs 382 LLVM_DEBUG({ 383 dbgs() << "buildGraph: "; 384 dumpBundle(Values); 385 }); 386 387 if (!areVectorizable(Values)) 388 return markFailed(); 389 390 assert(getOpcode(Values) && "Opcodes for all values must match"); 391 unsigned ValuesOpcode = getOpcode(Values).getValue(); 392 393 SmallVector<VPValue *, 4> CombinedOperands; 394 if (areCommutative(Values)) { 395 bool MultiNodeRoot = !MultiNodeActive; 396 MultiNodeActive = true; 397 for (auto &Operands : getOperands(Values)) { 398 LLVM_DEBUG({ 399 dbgs() << " Visiting Commutative"; 400 dumpBundle(Operands); 401 }); 402 403 auto OperandsOpcode = getOpcode(Operands); 404 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { 405 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); 406 CombinedOperands.push_back(buildGraph(Operands)); 407 } else { 408 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); 409 // Create dummy VPInstruction, which will we replace later by the 410 // re-ordered operand. 411 VPInstruction *Op = new VPInstruction(0, {}); 412 CombinedOperands.push_back(Op); 413 MultiNodeOps.emplace_back(Op, Operands); 414 } 415 } 416 417 if (MultiNodeRoot) { 418 LLVM_DEBUG(dbgs() << "Reorder \n"); 419 MultiNodeActive = false; 420 421 auto FinalOrder = reorderMultiNodeOps(); 422 423 MultiNodeOps.clear(); 424 for (auto &Ops : FinalOrder) { 425 VPInstruction *NewOp = buildGraph(Ops.second); 426 Ops.first->replaceAllUsesWith(NewOp); 427 for (unsigned i = 0; i < CombinedOperands.size(); i++) 428 if (CombinedOperands[i] == Ops.first) 429 CombinedOperands[i] = NewOp; 430 delete Ops.first; 431 Ops.first = NewOp; 432 } 433 LLVM_DEBUG(dbgs() << "Found final order\n"); 434 } 435 } else { 436 LLVM_DEBUG(dbgs() << " NonCommuntative\n"); 437 if (ValuesOpcode == Instruction::Load) 438 for (VPValue *V : Values) 439 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); 440 else 441 for (auto &Operands : getOperands(Values)) 442 CombinedOperands.push_back(buildGraph(Operands)); 443 } 444 445 unsigned Opcode; 446 switch (ValuesOpcode) { 447 case Instruction::Load: 448 Opcode = VPInstruction::SLPLoad; 449 break; 450 case Instruction::Store: 451 Opcode = VPInstruction::SLPStore; 452 break; 453 default: 454 Opcode = ValuesOpcode; 455 break; 456 } 457 458 if (!CompletelySLP) 459 return markFailed(); 460 461 assert(CombinedOperands.size() > 0 && "Need more some operands"); 462 auto *VPI = new VPInstruction(Opcode, CombinedOperands); 463 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); 464 465 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); 466 cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n"); 467 addCombined(Values, VPI); 468 return VPI; 469 } 470