// RUN: mlir-opt %s --transform-interpreter -canonicalize --split-input-file --verify-diagnostics | FileCheck %s

func.func @simple_depth_2_unpeeled(%global: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // Predication is not currently implemented for transfer_read/write, so this is expected to fail.
  // expected-note @below {{couldn't predicate}}
  scf.for %i = %c0 to %c100 step %c4 iter_args(%accum = %c0f) -> f32 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    %0 = arith.addf %accum, %accum : f32
    scf.yield %0 : f32
  }
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    // expected-error @below {{irreversible pipelining failure}}
    // expected-note @below {{try setting "peel_epilogue"}}
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}

// -----

// The loop pipeliner is tested separately; here we only verify the overall shape of the IR.

func.func private @body(index, memref<?xf32, #gpu.address_space<workgroup>>)

// CHECK-LABEL: @simple_depth_2_peeled
// CHECK-SAME: %[[ARG:.+]]: memref
func.func @simple_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c200 = arith.constant 200 : index
  %c4 = arith.constant 4 : index
  // CHECK: memref.alloc
  %shared = memref.alloc(%c200) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[LOADED1:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOADED2:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[IA1:.+]] = %[[LOADED1]], %[[IA2:.+]] = %[[LOADED2]])
  // CHECK: vector.transfer_write %[[IA1]]
  // CHECK: func.call @body
  // CHECK: %[[LOCAL_LOADED:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: scf.yield %[[IA2]], %[[LOCAL_LOADED]]
  scf.for %i = %c0 to %c100 step %c4 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    func.call @body(%i, %shared) : (index, memref<?xf32, #gpu.address_space<workgroup>>) -> ()
  }
  // CHECK: vector.transfer_write %[[LOOP]]#0
  // CHECK: call @body
  // CHECK: vector.transfer_write %[[LOOP]]#1
  // CHECK: call @body
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}

// -----

// CHECK-LABEL: @async_depth_2_predicated
// CHECK-SAME: %[[GLOBAL:.+]]: memref
func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  // CHECK-DAG: %[[C4:.+]] = arith.constant 4
  // CHECK-DAG: %[[C90:.+]] = arith.constant 90
  // CHECK-DAG: %[[C96:.+]] = arith.constant 96
  // CHECK-DAG: %[[C8:.+]] = arith.constant 8
  // CHECK-DAG: %[[C2:.+]] = arith.constant 2
  // CHECK-DAG: %[[C0:.+]] = arith.constant 0
  %c4 = arith.constant 4 : index
  // CHECK: %[[SHARED:.+]] = memref.alloc{{.*}} #gpu.address_space<workgroup>
  %shared = memref.alloc(%alloc_size) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
  // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
  // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
  // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
  // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
  scf.for %i = %c0 to %c98 step %c4 {
    // Condition for the predication "select" below.
    // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
    // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
    // Original "select" with updated induction variable.
    // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
    // CHECK: %[[SELECTED0:.+]] = arith.select %[[CMP1]], %[[C4]], %[[C2]]
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index

    // Updated induction variables (two more) for the device_async_copy below.
    // These are generated repeatedly by the pipeliner.
    // CHECK: %[[I_PLUS_8_2:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[I_PLUS_8_3:.+]] = arith.addi %[[I]], %[[C8]]

    // The second "select" is generated by predication and selects 0 for
    // the last two iterations.
    // CHECK: %[[SELECTED1:.+]] = arith.select %[[CMP0]], %[[SELECTED0]], %[[C0]]
    // CHECK: %[[ASYNC_TOKEN:.+]] = nvgpu.device_async_copy %[[GLOBAL]][%[[I_PLUS_8_3]]], %[[SHARED]][%[[I_PLUS_8_2]]], 4, %[[SELECTED1]]
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>

    nvgpu.device_async_wait %token

    // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
  }
  // There is no need to wait for the last copies as they were fully predicated
  // out and don't load the original data.
  // CHECK-NOT: nvgpu.device_async_wait
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}

// -----

// CHECK-LABEL: @async_depth_2_peeled
func.func @async_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: nvgpu.device_async_copy
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.for
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: arith.select
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.yield
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
  scf.for %i = %c0 to %c98 step %c4 {
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
    nvgpu.device_async_wait %token
  }
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}