1 //===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H 10 #define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H 11 12 #include "mlir/IR/Attributes.h" 13 #include "mlir/IR/MLIRContext.h" 14 15 namespace mlir { 16 namespace transform { 17 namespace gpu { 18 19 /// Base struct to hold GPU mapping information for a given operation. 20 struct MappingInfo { 21 /// Number of threads to use for the mapping. 22 /// Note: When the number of threads used is smaller than the total number of 23 /// available threads, predication ensues. It is often useful to use more 24 /// threads and saturate memory bandwidth for some operations, even if others 25 /// end up being predicated. 26 SmallVector<int64_t> numThreads; 27 28 /// Thread mapping attributes, one per entry of `numThreads`. 29 SmallVector<Attribute> threadMapping; 30 }; 31 32 struct CopyMappingInfo : public MappingInfo { 33 /// Status of the mapping computation, invalid usually means too many threads 34 /// are required and we fail to map. This usually happens when the copy is too 35 /// large compared to the number of threads. 36 enum class Status { Success = 0, RequiresPredication, Invalid }; 37 38 /// Greedily compute the MappingInfo to use to perform a copy of `sizes` 39 /// elements of bitwidth `elementalBitwidth`. 40 /// The `desiredBitAlignment` is the number of elements by which the most 41 /// minor dimension of the copy is expected to be aligned. 42 /// This is an approximation of the final alignment, for each row of the copy. 43 /// This is used to restrict the size of copied vector so that they match 44 /// potential subsequent cp.async. 45 /// If the alignment does not match the required alignment for a cp.async down 46 /// the line, the conversion to cp.async will be eventually skipped, possibly 47 /// degrading performance. 48 /// When `favorPredication` is false, the mapping is computed to fill all 49 /// threads with an equal amount of data to copy, so as to avoid predication. 50 /// Predication ends up requiring a split epilogue in current pipelining 51 /// implementations and is better avoided when possible. 52 CopyMappingInfo(MLIRContext *ctx, int totalNumThreads, 53 int64_t desiredBitAlignment, ArrayRef<int64_t> sizes, 54 bool favorPredication = false, 55 int64_t elementalBitwidth = 32); 56 57 private: 58 /// Determine the maximal vector size to use to copy a contiguous array of 59 /// `numContiguousElements`, each of bitwidth `elementalBitwidth`. 60 /// The `alignment` is the number of elements by which the most minor 61 /// dimension of the copy is aligned. This is an approximation of actual 62 /// memory alignment after bufferization, for each row of the copy. This is 63 /// used to restrict the of the copied vector so that it is properly aligned 64 /// with the requirements of cp.async. If the copy alignment does not match 65 /// the required aligned for a cp.async, thae conversion to cp.async will be 66 /// skipped. 67 /// Asserts that `elementalBitwidth` divides `numContiguousElements`. 68 static int64_t 69 maxContiguousElementsToTransfer(int64_t alignment, 70 int64_t numContiguousElements, 71 int64_t elementalBitwidth = 32); 72 73 /// Compute the number of threads to use to perform a copy of `sizes` 74 /// elements of `elementalBitwidth`. 75 /// The `alignment` is the number of elements by which the most minor 76 /// dimension of the copy is aligned. This is an approximation of actual 77 /// memory alignment after bufferization, for each row of the copy. This is 78 /// used to restrict the of the copied vector so that it is properly aligned 79 /// with the requirements of cp.async. If the copy alignment does not match 80 /// the required aligned for a cp.async, the conversion to cp.async will be 81 /// skipped. 82 /// When `favorPredication` is false, the implementation avoids predication 83 /// in the copy, even if it means reducing the granularity of the transfer. 84 /// Otherwise, the implementation will come up with a maximal assignment of 85 /// the remaining threads to sizes of interest, using a DP implementation. 86 Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes, 87 int64_t desiredVectorSize, bool favorPredication); 88 Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes, 89 int64_t desiredVectorSize); 90 91 public: 92 // Pretty-printing and diagnostic methods. 93 void print(llvm::raw_ostream &os) const; 94 LLVM_DUMP_METHOD void dump() const; 95 96 /// Static quantity determining the number of bits to target in an individual 97 /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid 98 /// transfer sizes. In the future we should have more hardware pluggability 99 /// here, especially when we want sub-byte granularity 100 static constexpr int64_t kMaxVectorLoadBitWidth = 128; 101 102 /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy. 103 int64_t vectorSize; 104 105 /// Number of threads to use for the copy mapping, from most major to most 106 /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads 107 /// for best coalescing). 108 using MappingInfo::numThreads; 109 110 /// Explicit computation / injection of the smallest bounding tile sizes after 111 /// mapping to `numThreads`. This is useful in masked scenarios. 112 SmallVector<int64_t> smallestBoundingTileSizes; 113 114 /// Thread mapping attributes, one per entry of `numThreads`. 115 using MappingInfo::threadMapping; 116 117 /// The status of a particular copy mapping. Must be checked before applying 118 /// transformations. 119 Status status; 120 }; 121 122 } // namespace gpu 123 } // namespace transform 124 } // namespace mlir 125 126 #endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H 127