// RUN: mlir-opt %s --transform-interpreter | FileCheck %s
// RUN: mlir-opt %s --gpu-eliminate-barriers | FileCheck %s

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %0 {
      transform.apply_patterns.gpu.eliminate_barriers
    } : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: @read_read_write
func.func @read_read_write(%arg0: memref<?xf32>, %arg1: index) attributes {__parallel_region_boundary_for_test} {
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // The barrier between loads can be removed.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  // The barrier between load and store cannot be removed (unless we reason about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: store
  memref.store %2, %arg0[%arg1] : memref<?xf32>
  return
}

// CHECK-LABEL: @write_read_read
func.func @write_read_read(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
attributes {__parallel_region_boundary_for_test} {
  // CHECK: store
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  // The barrier between load and store cannot be removed (unless we reason about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // The barrier between the two loads can be removed.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  return %2 : f32
}

// CHECK-LABEL: @write_in_a_loop
func.func @write_in_a_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    memref.store %arg1, %arg0[%i] : memref<?xf32>
    // Cannot remove this barrier because it guards write-after-write between different iterations.
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @read_read_write_loop
func.func @read_read_write_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // (Note that if the subscript were different, this would be a race with the store at the end of the loop.)
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // Guards read-after-write where the write happens on the previous iteration.
    // CHECK: barrier
    gpu.barrier
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
    // Guards write-after-read.
    // CHECK: barrier
    gpu.barrier
    memref.store %2, %arg0[%i] : memref<?xf32>
  }
  return
}

// CHECK-LABEL: @read_read_write_loop_trailing_sync
func.func @read_read_write_loop_trailing_sync(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // CHECK: load
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // This can be removed because it only guards a read-after-read.
    // CHECK-NOT: barrier
    gpu.barrier
    // CHECK: load
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
    // CHECK: barrier
    gpu.barrier
    // CHECK: store
    memref.store %2, %arg0[%i] : memref<?xf32>
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @write_write_noalias
func.func @write_write_noalias(%arg0: index, %arg1: f32) -> (memref<42xf32>, memref<10xf32>)
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  %1 = memref.alloc() : memref<10xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove that the two allocations don't alias.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %1[%arg0] : memref<10xf32>
  return %0, %1 : memref<42xf32>, memref<10xf32>
}

// CHECK-LABEL: @write_write_alloc_arg_noalias
func.func @write_write_alloc_arg_noalias(%arg0: index, %arg1: f32, %arg2: memref<?xf32>) -> (memref<42xf32>)
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove that the local allocation doesn't alias the function argument.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %arg2[%arg0] : memref<?xf32>
  return %0 : memref<42xf32>
}

// CHECK-LABEL: @repeated_barrier
func.func @repeated_barrier(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // CHECK: gpu.barrier
  gpu.barrier
  // The immediately repeated barrier is redundant and can be removed.
  // CHECK-NOT: gpu.barrier
  gpu.barrier
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  return %0 : f32
}

// CHECK-LABEL: @symmetric_stop
func.func @symmetric_stop(%val: f32) -> (f32, f32, f32, f32, f32)
attributes {__parallel_region_boundary_for_test} {
  // CHECK: %[[A:.+]] = memref.alloc
  // CHECK: %[[B:.+]] = memref.alloc
  // CHECK: %[[C:.+]] = memref.alloc
  %A = memref.alloc() : memref<f32>
  %B = memref.alloc() : memref<f32>
  %C = memref.alloc() : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[A]]
  memref.store %val, %A[] : memref<f32>
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  %0 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[B]]
  memref.store %val, %B[] : memref<f32>
  // This barrier is eliminated because the surrounding barriers are sufficient
  // to guard write/read on all memrefs.
  // CHECK-NOT: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  %1 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[C]]
  memref.store %val, %C[] : memref<f32>
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  // CHECK: memref.load %[[B]]
  // CHECK: memref.load %[[C]]
  %2 = memref.load %A[] : memref<f32>
  %3 = memref.load %B[] : memref<f32>
  %4 = memref.load %C[] : memref<f32>
  return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
}

// CHECK-LABEL: @nested_loop_barrier_only
func.func @nested_loop_barrier_only() attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  // Note: the barrier can be removed and, as a consequence, the loops get folded
  // by the greedy rewriter.
  // CHECK-NOT: scf.for
  // CHECK-NOT: gpu.barrier
  scf.for %j = %c0 to %c42 step %c1 {
    scf.for %i = %c0 to %c42 step %c1 {
      gpu.barrier
    }
  }
  return
}