xref: /llvm-project/mlir/include/mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h (revision db791b278a414fb6df1acc1799adcf11d8fb9169)
1 //===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
10 #define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
11 
12 #include "mlir/IR/Attributes.h"
13 #include "mlir/IR/MLIRContext.h"
14 
15 namespace mlir {
16 namespace transform {
17 namespace gpu {
18 
19 /// Base struct to hold GPU mapping information for a given operation.
20 struct MappingInfo {
21   /// Number of threads to use for the mapping.
22   /// Note: When the number of threads used is smaller than the total number of
23   /// available threads, predication ensues. It is often useful to use more
24   /// threads and saturate memory bandwidth for some operations, even if others
25   /// end up being predicated.
26   SmallVector<int64_t> numThreads;
27 
28   /// Thread mapping attributes, one per entry of `numThreads`.
29   SmallVector<Attribute> threadMapping;
30 };
31 
32 struct CopyMappingInfo : public MappingInfo {
33   /// Status of the mapping computation, invalid usually means too many threads
34   /// are required and we fail to map. This usually happens when the copy is too
35   /// large compared to the number of threads.
36   enum class Status { Success = 0, RequiresPredication, Invalid };
37 
38   /// Greedily compute the MappingInfo to use to perform a copy of `sizes`
39   /// elements of bitwidth `elementalBitwidth`.
40   /// The `desiredBitAlignment` is the number of elements by which the most
41   /// minor dimension of the copy is expected to be aligned.
42   /// This is an approximation of the final alignment, for each row of the copy.
43   /// This is used to restrict the size of copied vector so that they match
44   /// potential subsequent cp.async.
45   /// If the alignment does not match the required alignment for a cp.async down
46   /// the line, the conversion to cp.async will be eventually skipped, possibly
47   /// degrading performance.
48   /// When `favorPredication` is false, the mapping is computed to fill all
49   /// threads with an equal amount of data to copy, so as to avoid predication.
50   /// Predication ends up requiring a split epilogue in current pipelining
51   /// implementations and is better avoided when possible.
52   CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
53                   int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
54                   bool favorPredication = false,
55                   int64_t elementalBitwidth = 32);
56 
57 private:
58   /// Determine the maximal vector size to use to copy a contiguous array of
59   /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
60   /// The `alignment` is the number of elements by which the most minor
61   /// dimension of the copy is aligned. This is an approximation of actual
62   /// memory alignment after bufferization, for each row of the copy. This is
63   /// used to restrict the of the copied vector so that it is properly aligned
64   /// with the requirements of cp.async. If the copy alignment does not match
65   /// the required aligned for a cp.async, thae conversion to cp.async will be
66   /// skipped.
67   /// Asserts that `elementalBitwidth` divides `numContiguousElements`.
68   static int64_t
69   maxContiguousElementsToTransfer(int64_t alignment,
70                                   int64_t numContiguousElements,
71                                   int64_t elementalBitwidth = 32);
72 
73   /// Compute the number of threads to use to perform a copy of `sizes`
74   /// elements of `elementalBitwidth`.
75   /// The `alignment` is the number of elements by which the most minor
76   /// dimension of the copy is aligned. This is an approximation of actual
77   /// memory alignment after bufferization, for each row of the copy. This is
78   /// used to restrict the of the copied vector so that it is properly aligned
79   /// with the requirements of cp.async. If the copy alignment does not match
80   /// the required aligned for a cp.async, the conversion to cp.async will be
81   /// skipped.
82   /// When `favorPredication` is false, the implementation avoids predication
83   /// in the copy, even if it means reducing the granularity of the transfer.
84   /// Otherwise, the implementation will come up with a maximal assignment of
85   /// the remaining threads to sizes of interest, using a DP implementation.
86   Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
87                          int64_t desiredVectorSize, bool favorPredication);
88   Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
89                              int64_t desiredVectorSize);
90 
91 public:
92   // Pretty-printing and diagnostic methods.
93   void print(llvm::raw_ostream &os) const;
94   LLVM_DUMP_METHOD void dump() const;
95 
96   /// Static quantity determining the number of bits to target in an individual
97   /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
98   /// transfer sizes. In the future we should have more hardware pluggability
99   /// here, especially when we want sub-byte granularity
100   static constexpr int64_t kMaxVectorLoadBitWidth = 128;
101 
102   /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
103   int64_t vectorSize;
104 
105   /// Number of threads to use for the copy mapping, from most major to most
106   /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
107   /// for best coalescing).
108   using MappingInfo::numThreads;
109 
110   /// Explicit computation / injection of the smallest bounding tile sizes after
111   /// mapping to `numThreads`. This is useful in masked scenarios.
112   SmallVector<int64_t> smallestBoundingTileSizes;
113 
114   /// Thread mapping attributes, one per entry of `numThreads`.
115   using MappingInfo::threadMapping;
116 
117   /// The status of a particular copy mapping. Must be checked before applying
118   /// transformations.
119   Status status;
120 };
121 
122 } // namespace gpu
123 } // namespace transform
124 } // namespace mlir
125 
126 #endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
127