//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes that expose pass constructors.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include <optional>

namespace llvm {
class TargetMachine;
class LLVMContext;
class Module;
} // namespace llvm

namespace mlir {
class TypeConverter;
class ConversionTarget;
namespace func {
class FuncOp;
} // namespace func

#define GEN_PASS_DECL
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"

/// Pass that moves ops that are likely index computations into the gpu.launch
/// body.
std::unique_ptr<Pass> createGpuLauchSinkIndexComputationsPass();

/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
/// a separate kernel function.
std::unique_ptr<OperationPass<ModuleOp>>
createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());

/// Rewrites a function region so that GPU ops execute asynchronously.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();

/// Maps the parallel loops found in the given function to workgroups. The
/// first loop encountered is mapped to the global workgroup, and the second
/// loop encountered to the local workgroup. Within each mapping, the first
/// three dimensions are mapped to the x/y/z hardware ids; all remaining
/// dimensions are mapped to sequential loops.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();

/// Collect a set of patterns to rewrite GlobalIdOp ops within the GPU dialect.
void populateGpuGlobalIdPatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite shuffle ops within the GPU dialect.
void populateGpuShufflePatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to break down subgroup_reduce ops into smaller
/// ones supported by the target, such that `size <= maxShuffleBitwidth`, where
/// `size` is the bitwidth of the reduced value.
void populateGpuBreakDownSubgroupReducePatterns(
    RewritePatternSet &patterns, unsigned maxShuffleBitwidth = 32,
    PatternBenefit benefit = 1);

/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle`
/// ops over `shuffleBitwidth` scalar types, using the butterfly shuffle
/// algorithm. Assumes that the subgroup has `subgroupSize` lanes.
///
/// The patterns populated by this function ignore ops with the `cluster_size`
/// attribute; `populateGpuLowerClusteredSubgroupReduceToShufflePatterns` is
/// their disjoint counterpart.
void populateGpuLowerSubgroupReduceToShufflePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);

/// Disjoint counterpart of `populateGpuLowerSubgroupReduceToShufflePatterns`
/// that only matches `gpu.subgroup_reduce` ops with a `cluster_size`
/// attribute.
void populateGpuLowerClusteredSubgroupReduceToShufflePatterns(
    RewritePatternSet &patterns, unsigned subgroupSize,
    unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
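
// The populate* functions above follow the usual MLIR pattern-driver recipe.
// A minimal sketch, assuming the greedy rewrite driver from
// "mlir/Transforms/GreedyPatternRewriteDriver.h" is run inside a pass (the
// subgroup size of 32 is an illustrative value, not a recommendation):
//
//   // Inside a Pass::runOnOperation() body:
//   RewritePatternSet patterns(&getContext());
//   populateGpuLowerSubgroupReduceToShufflePatterns(
//       patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);
//   if (failed(applyPatternsAndFoldGreedily(getOperation(),
//                                           std::move(patterns))))
//     signalPassFailure();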

/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
  populateGpuAllReducePatterns(patterns);
  populateGpuGlobalIdPatterns(patterns);
  populateGpuShufflePatterns(patterns);
}

namespace gpu {
/// Searches for all GPU modules in `op` and transforms them into GPU binary
/// operations. The resulting `gpu.binary` carries `handler` as its offloading
/// handler attribute.
LogicalResult transformGpuModulesToBinaries(
    Operation *op, OffloadingLLVMTranslationAttrInterface handler = nullptr,
    const gpu::TargetOptions &options = {});
} // namespace gpu

/// Collect a set of patterns to decompose memref ops.
void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);

/// Pass that decomposes memref ops inside the `gpu.launch` body.
std::unique_ptr<Pass> createGpuDecomposeMemrefsPass();

/// Collect a set of patterns to erase barriers that do not separate
/// conflicting memory side effects.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);

//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//

/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"

} // namespace mlir

#endif // MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_