// RUN: mlir-opt %s --transform-interpreter | FileCheck %s
// RUN: mlir-opt %s --gpu-eliminate-barriers | FileCheck %s
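// Both invocations are expected to produce the same output: the first applies
// the barrier-elimination patterns through the transform interpreter, the
// second through the standalone --gpu-eliminate-barriers pass.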

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %0 {
      transform.apply_patterns.gpu.eliminate_barriers
    } : !transform.any_op
    transform.yield
  }
}

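// All functions below carry the __parallel_region_boundary_for_test unit
// attribute, which the elimination analysis treats as the boundary of the
// parallel region, so the tests do not need an enclosing gpu.launch.
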
// CHECK-LABEL: @read_read_write
func.func @read_read_write(%arg0: memref<?xf32>, %arg1: index) attributes {__parallel_region_boundary_for_test} {
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
  // The barrier between the two loads can be removed.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  // The barrier between the load and the store cannot be removed (unless we reason about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: store
  memref.store %2, %arg0[%arg1] : memref<?xf32>
  return
}

// CHECK-LABEL: @write_read_read
func.func @write_read_read(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
attributes {__parallel_region_boundary_for_test} {
  // CHECK: store
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  // The barrier between the store and the load cannot be removed (unless we reason about accessed subsets).
  // CHECK: barrier
  gpu.barrier
  // CHECK: load
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
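  // The barrier between the two loads only guards a read-after-read and can
  // therefore be removed.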
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: load
  %1 = memref.load %arg0[%arg1] : memref<?xf32>
  %2 = arith.addf %0, %1 : f32
  return %2 : f32
}

// CHECK-LABEL: @write_in_a_loop
func.func @write_in_a_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    memref.store %arg1, %arg0[%i] : memref<?xf32>
    // Cannot remove this barrier because it guards write-after-write between different iterations.
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @read_read_write_loop
func.func @read_read_write_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // (Note that if the subscript were different, this would be a race with the store at the end of the loop.)
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // Guards read-after-write where the write happens on the previous iteration.
    // CHECK: barrier
    gpu.barrier
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
    // Guards write-after-read.
    // CHECK: barrier
    gpu.barrier
    memref.store %2, %arg0[%i] : memref<?xf32>
  }
  return
}

// CHECK-LABEL: @read_read_write_loop_trailing_sync
func.func @read_read_write_loop_trailing_sync(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %c42 step %c1 {
    // CHECK: load
    %0 = memref.load %arg0[%i] : memref<?xf32>
    // This can be removed because it only guards a read-after-read.
    // CHECK-NOT: barrier
    gpu.barrier
    // CHECK: load
    %1 = memref.load %arg0[%i] : memref<?xf32>
    %2 = arith.addf %0, %1 : f32
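    // Kept: guards the write-after-read on the same location.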
    // CHECK: barrier
    gpu.barrier
    // CHECK: store
    memref.store %2, %arg0[%i] : memref<?xf32>
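    // Kept: without reasoning about accessed subsets, the store may conflict
    // with the loads of the next iteration.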
    // CHECK: barrier
    gpu.barrier
  }
  return
}

// CHECK-LABEL: @write_write_noalias
func.func @write_write_noalias(%arg0: index, %arg1: f32) -> (memref<42xf32>, memref<10xf32>)
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  %1 = memref.alloc() : memref<10xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove that the two allocations don't alias.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %1[%arg0] : memref<10xf32>
  return %0, %1 : memref<42xf32>, memref<10xf32>
}

// CHECK-LABEL: @write_write_alloc_arg_noalias
func.func @write_write_alloc_arg_noalias(%arg0: index, %arg1: f32, %arg2: memref<?xf32>) -> (memref<42xf32>)
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.alloc() : memref<42xf32>
  // CHECK: store
  memref.store %arg1, %0[%arg0] : memref<42xf32>
  // This can be removed because we can prove that the local allocation doesn't alias the function argument.
  // CHECK-NOT: barrier
  gpu.barrier
  // CHECK: store
  memref.store %arg1, %arg2[%arg0] : memref<?xf32>
  return %0 : memref<42xf32>
}

// CHECK-LABEL: @repeated_barrier
func.func @repeated_barrier(%arg0: memref<?xf32>, %arg1: index, %arg2: f32) -> f32
attributes {__parallel_region_boundary_for_test} {
  %0 = memref.load %arg0[%arg1] : memref<?xf32>
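  // The first barrier is kept to guard the write-after-read between the load
  // above and the store below; its immediate duplicate is redundant and
  // removed.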
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK-NOT: gpu.barrier
  gpu.barrier
  memref.store %arg2, %arg0[%arg1] : memref<?xf32>
  return %0 : f32
}

// CHECK-LABEL: @symmetric_stop
func.func @symmetric_stop(%val: f32) -> (f32, f32, f32, f32, f32)
attributes {__parallel_region_boundary_for_test} {
  // CHECK: %[[A:.+]] = memref.alloc
  // CHECK: %[[B:.+]] = memref.alloc
  // CHECK: %[[C:.+]] = memref.alloc
  %A = memref.alloc() : memref<f32>
  %B = memref.alloc() : memref<f32>
  %C = memref.alloc() : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[A]]
  memref.store %val, %A[] : memref<f32>
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  %0 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[B]]
  memref.store %val, %B[] : memref<f32>
  // This barrier is eliminated because the surrounding barriers are already
  // sufficient to order the writes and reads on all memrefs.
  // CHECK-NOT: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  %1 = memref.load %A[] : memref<f32>
  // CHECK: memref.store %{{.*}}, %[[C]]
  memref.store %val, %C[] : memref<f32>
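  // Kept: the loads below must observe the stores to %[[B]] and %[[C]].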
  // CHECK: gpu.barrier
  gpu.barrier
  // CHECK: memref.load %[[A]]
  // CHECK: memref.load %[[B]]
  // CHECK: memref.load %[[C]]
  %2 = memref.load %A[] : memref<f32>
  %3 = memref.load %B[] : memref<f32>
  %4 = memref.load %C[] : memref<f32>
  return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
}

// CHECK-LABEL: @nested_loop_barrier_only
func.func @nested_loop_barrier_only() attributes {__parallel_region_boundary_for_test} {
  %c0 = arith.constant 0 : index
  %c42 = arith.constant 42 : index
  %c1 = arith.constant 1 : index
  // Note: the barrier can be removed and, as a consequence, the now-empty
  // loops get folded away by the greedy rewriter.
  // CHECK-NOT: scf.for
  // CHECK-NOT: gpu.barrier
  scf.for %j = %c0 to %c42 step %c1 {
    scf.for %i = %c0 to %c42 step %c1 {
      gpu.barrier
    }
  }
  return
}