//===- NVGPUTransformOps.td - NVGPU transform ops ----------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef NVGPU_TRANSFORM_OPS
#define NVGPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

def ApplyNVGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.nvgpu.nvgpu_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert NVGPU dialect ops to NVVM dialect ops. These
    patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}

//===----------------------------------------------------------------------===//
// CreateAsyncGroupsOp
//===----------------------------------------------------------------------===//

def CreateAsyncGroupsOp :
  Op<Transform_Dialect, "nvgpu.create_async_groups",
    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Look for global to shared memory copies within the targeted op in the form
    of vector transfer ops and convert them to async copies when possible.
    Consecutive copies are put into the same group. A "wait" operation is
    inserted right at the end of the group.

    `bypass_l1` specifies whether `bypassL1` attributes should be added to
    the async copies. `bypass_l1` is a compiler hint: only 16-byte transfers
    can bypass the L1 cache, so this attribute is not set for any other
    transfer sizes.

    #### Return modes

    This op consumes the `target` handle and produces the `result` handle,
    which is mapped to the same payload operations as the `target` handle.
    The op modifies the payload.
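
    #### Example

    A minimal sketch of the textual form, assuming `!transform.any_op` handles
    (any type implementing `TransformHandleTypeInterface` works) and an
    illustrative `%func` handle to the targeted payload op:

    ```mlir
    // Convert eligible global-to-shared-memory copies inside %func into
    // async copy groups, hinting that 16-byte transfers may bypass L1.
    %grouped = transform.nvgpu.create_async_groups %func {bypass_l1}
      : (!transform.any_op) -> !transform.any_op
    ```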
  }];

  let arguments = (ins TransformHandleTypeInterface:$target,
                       UnitAttr:$bypass_l1);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target attr-dict `:` functional-type(operands, results)
  }];

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// PipelineSharedMemoryCopiesOp
//===----------------------------------------------------------------------===//

def PipelineSharedMemoryCopiesOp :
  Op<Transform_Dialect, "nvgpu.pipeline_shared_memory_copies",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let summary =
    "Applies software pipelining to a given loop with shared memory copies";

  let description = [{
    Applies software pipelining to a given scf.for loop. The pipelining
    strategy looks for a load into shared memory and pipelines it to overlap
    with the rest of the loop.

    NOTE: It is the user's responsibility to ensure that there are no
    dependencies between `depth` iterations of the loop by using
    multi-buffering. It is also the user's responsibility to ensure that
    enough shared memory is allocated to cover the writes performed by the
    `depth - 1` speculative iterations.

    `depth` indicates how many stages the software pipeline should have.
    `peel_epilogue` forces the epilogue to be peeled out instead of
    potentially using predicated operations for the epilogue phase.

    #### Return modes

    Consumes the operand handle and produces a result handle pointing to the
    loop, which may or may not have been pipelined. Produces a definite
    failure if the loop pipeliner mutated the IR before failing to pipeline,
    in particular if `peel_epilogue` is not set and the loop body doesn't
    support predication. If the failure propagation mode is set to
    "propagate", produces a silenceable failure when pipelining preconditions
    are not met, e.g., the loop bound is not static, or when the loop wasn't
    pipelined due to the lack of loads into shared memory. If the failure
    propagation mode is set to "suppress" (the default), the op succeeds in
    these cases and associates the result handle with the original loop.

    TODO: the shared memory part and behavior specific to NVGPU should be
    made orthogonal to pipelining so that `transform.loop.pipeline` becomes
    usable here.
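
    #### Example

    A minimal sketch of the textual form, assuming an `!transform.any_op`
    handle `%loop` pointing to an `scf.for` payload op (the handle type and
    the `%loop` name are illustrative):

    ```mlir
    // Pipeline the shared memory copies in %loop into a 3-stage software
    // pipeline; precondition failures are suppressed and the original loop
    // is associated with the result handle.
    %pipelined = transform.nvgpu.pipeline_shared_memory_copies
      failures(suppress) %loop {depth = 3 : i64}
      : (!transform.any_op) -> !transform.any_op
    ```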
  }];

  let arguments = (ins TransformHandleTypeInterface:$for_op,
                       I64Attr:$depth,
                       UnitAttr:$peel_epilogue,
                       DefaultValuedAttr<FailurePropagationMode,
                          "::mlir::transform::FailurePropagationMode::Suppress">
                          :$failure_propagation_mode);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    `failures` `(` $failure_propagation_mode `)`
    $for_op
    attr-dict
    `:` functional-type(operands, results)
  }];

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::scf::ForOp forOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// RewriteMatmulAsMmaSyncOp
//===----------------------------------------------------------------------===//

def RewriteMatmulAsMmaSyncOp :
  Op<Transform_Dialect, "nvgpu.rewrite_matmul_as_mma_sync",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Rewrite a matmul operation on memref to an mma.sync operation on vectors.

    Memory copies with the required access patterns are automatically inserted.
    Operations that do not have a 1-1 mapping to mma.sync operations are left
    unchanged.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target);
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::linalg::LinalgOp linalgOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// RewriteCopyAsTmaOp
//===----------------------------------------------------------------------===//

def RewriteCopyAsTmaOp :
  Op<Transform_Dialect, "nvgpu.rewrite_copy_as_tma",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Rewrite a copy operation on memref to TMA operations that transit through
    shared memory.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target);
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure apply(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::transform::TransformResults &transformResults,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // NVGPU_TRANSFORM_OPS