// RUN: mlir-opt %s -canonicalize="test-convergence" --split-input-file -allow-unregistered-dialect | FileCheck %s

// Fold all the gpu.wait ops as they are redundant.
// CHECK-LABEL: func @fold_wait_op_test1
func.func @fold_wait_op_test1() {
  %1 = gpu.wait async
  gpu.wait []
  %3 = gpu.wait async
  gpu.wait [%3]
  return
}
// CHECK-NOT: gpu.wait

// -----

// Erase duplicate barriers.
// CHECK-LABEL: func @erase_barriers
// CHECK-NEXT: gpu.barrier
// CHECK-NEXT: return
func.func @erase_barriers() {
  gpu.barrier
  gpu.barrier
  return
}

// -----

// Replace uses of the gpu.wait op with its async dependency.
// CHECK-LABEL: func @fold_wait_op_test2
func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
  %0 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%0] () : memref<5xf16>
  gpu.wait [%0]
  %1 = gpu.wait async [%0]
  %memref_0, %asyncToken_0 = gpu.alloc async [%1] () : memref<5xf16>
  gpu.wait [%1]
  return %memref, %memref_0 : memref<5xf16>, memref<5xf16>
}
// CHECK-NEXT: %[[TOKEN0:.*]] = gpu.wait async
// CHECK-NEXT: gpu.alloc async [%[[TOKEN0]]] ()
// CHECK-NEXT: %[[TOKEN1:.*]] = gpu.wait async
// CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
// CHECK-NEXT: return

// -----

// Erase the gpu.memcpy, as the copy destination is a locally allocated buffer
// whose only other uses are deallocations.
// CHECK-LABEL: func @fold_memcpy_op
func.func @fold_memcpy_op(%arg0: i1) {
  %cst = arith.constant 0.000000e+00 : f16
  %1 = memref.alloc() : memref<2xf16>
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  %5 = scf.if %arg0 -> (i1) {
    memref.dealloc %1 : memref<2xf16>
    scf.yield %arg0 : i1
  } else {
    memref.dealloc %1 : memref<2xf16>
    scf.yield %arg0 : i1
  }
  return
}
// CHECK-NOT: gpu.memcpy

// -----

// We cannot fold the memcpy here, as the destination is a block argument.
// CHECK-LABEL: func @do_not_fold_memcpy_op1
func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
  %cst = arith.constant 0.000000e+00 : f16
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  return
}
// CHECK: gpu.memcpy

// -----

// We cannot fold the gpu.memcpy, as another op has a read effect on the destination.
// CHECK-LABEL: func @do_not_fold_memcpy_op2
func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
  %cst = arith.constant 0.000000e+00 : f16
  %1 = memref.alloc() : memref<2xf16>
  %2 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
  gpu.wait [%2]
  affine.store %cst, %memref[0] : memref<2xf16>
  %3 = gpu.wait async
  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
  gpu.wait [%3]
  %5 = memref.load %1[%arg1] : memref<2xf16>
  return %5 : f16
}
// CHECK: gpu.memcpy

// -----

// We cannot fold the gpu.memcpy, as the defining op of the destination is not an alloc-like op.
// CHECK-LABEL: func @do_not_fold_memcpy_op3
func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
  %0 = arith.constant 0 : index
  %1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
  gpu.memcpy %1, %arg1 : memref<i1>, memref<i1>
  func.return
}
// CHECK: gpu.memcpy

// -----

// Fold the memref.cast ops into the gpu.memcpy operands.
// CHECK-LABEL: @memcpy_after_cast
func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
  // CHECK-NOT: memref.cast
  // CHECK: gpu.memcpy
  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
  %1 = memref.cast %arg1 : memref<10xf32> to memref<?xf32>
  gpu.memcpy %0, %1 : memref<?xf32>, memref<?xf32>
  return
}

// -----

// Fold the memref.cast into the gpu.memset operand.
// CHECK-LABEL: @memset_after_cast
func.func @memset_after_cast(%arg0: memref<10xf32>, %arg1: f32) {
  // CHECK-NOT: memref.cast
  // CHECK: gpu.memset
  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
  gpu.memset %0, %arg1 : memref<?xf32>, f32
  return
}

// -----

// Test case: Folding of memref.dim(gpu.alloc(%size), %idx) -> %size
// CHECK-LABEL: func @gpu_dim_of_alloc(
// CHECK-SAME: %[[SIZE:[0-9a-z]+]]: index
// CHECK-NEXT: return %[[SIZE]] : index
func.func @gpu_dim_of_alloc(%size: index) -> index {
  %0 = gpu.alloc(%size) : memref<?xindex>
  %c0 = arith.constant 0 : index
  %1 = memref.dim %0, %c0 : memref<?xindex>
  return %1 : index
}

// -----

// Do not fold memref.dim when the index is out of bounds.
// CHECK-LABEL: func @out_of_bound_memref.dim
// CHECK: %[[MEMREF:.[a-z0-9A-Z_]+]] = memref.dim
// CHECK: return %[[MEMREF]] : index
func.func @out_of_bound_memref.dim(%arg : memref<?xi8>, %size: index) -> index {
  %c2 = arith.constant 2 : index
  %1 = memref.dim %arg, %c2 : memref<?xi8>
  return %1 : index
}

// -----

// Block and thread ids whose corresponding launch dimension is 1 fold to
// constant 0, which simplifies the index arithmetic in the launch body.
// CHECK-LABEL: func @simplify_gpu_launch
func.func @simplify_gpu_launch() attributes {llvm.emit_c_interface} {
  %cst = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %c16 = arith.constant 16 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<2x16x16xf32>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c16 step %c1 {
      scf.for %arg2 = %c0 to %c16 step %c1 {
        memref.store %cst, %0[%arg0, %arg1, %arg2] : memref<2x16x16xf32>
      }
    }
  }
  %1 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%1] () : memref<2x16x16xf32>
  %2 = gpu.memcpy async [%1] %memref, %0 : memref<2x16x16xf32>, memref<2x16x16xf32>
  gpu.wait [%1]
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1)
             threads(%arg3, %arg4, %arg5) in (%arg9 = %c32, %arg10 = %c1, %arg11 = %c1) {
    %3 = arith.muli %arg5, %c32 : index
    %4 = arith.muli %arg4, %c32 : index
    %5 = arith.addi %3, %4 : index
    %6 = arith.addi %5, %arg3 : index
    %7 = arith.divui %6, %c32 : index
    %8 = arith.muli %arg0, %c16 : index
    %9 = arith.muli %arg1, %c2 : index
    %10 = arith.muli %7, %c2 : index
    %11 = arith.addi %9, %10 : index
    %12 = memref.load %memref[%11, %c0, %8] : memref<2x16x16xf32>
    %13 = arith.addi %11, %c1 : index
    %14 = memref.load %memref[%13, %c0, %8] : memref<2x16x16xf32>
    memref.store %12, %memref[%11, %c0, %8] : memref<2x16x16xf32>
    memref.store %14, %memref[%13, %c0, %8] : memref<2x16x16xf32>
    gpu.terminator
  }
  return
}

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%[[TIDX:.*]], %{{.*}}, %{{.*}}) in (%{{.*}} = %c32, %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) {
// CHECK-NEXT: arith.divui %[[TIDX]], %c32 : index
// CHECK-NEXT: arith.muli %{{.*}}, %c2 : index
// CHECK-NEXT: memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT: arith.addi %{{.*}}, %[[C1]] : index
// CHECK-NEXT: memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT: memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT: memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT: gpu.terminator
// CHECK-NEXT: }

// -----

// CHECK-LABEL: func @make_reduce_uniform
// CHECK: gpu.launch blocks
// CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
// CHECK: %[[V2:.*]] = gpu.all_reduce add %[[V1]] uniform {
// CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
func.func @make_reduce_uniform() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.all_reduce add %1 {} : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// CHECK-LABEL: func @make_subgroup_reduce_uniform
// CHECK: gpu.launch blocks
// CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
// CHECK: %[[V2:.*]] = gpu.subgroup_reduce add %[[V1]] uniform
// CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
func.func @make_subgroup_reduce_uniform() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.subgroup_reduce add %1 : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// A gpu.subgroup_reduce over a cluster of size 1 is a no-op and folds to its operand.
// CHECK-LABEL: func @subgroup_reduce_cluster_size_1
// CHECK: gpu.launch blocks
// CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
// CHECK: "test.test3"(%[[V1]]) : (i32) -> ()
func.func @subgroup_reduce_cluster_size_1() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.subgroup_reduce add %1 cluster(size=1) : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// The GPU kernel does not have any side-effecting ops, so the entire
// gpu.launch op can fold away.

// CHECK-LABEL: func @gpu_launch_without_side_effects
// CHECK-NOT: gpu.launch
func.func @gpu_launch_without_side_effects() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
             threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = arith.addi %arg0, %arg1 : index
    gpu.terminator
  }
  return
}