// RUN: mlir-opt %s -canonicalize="test-convergence" --split-input-file -allow-unregistered-dialect | FileCheck %s

// Fold all the gpu.wait ops as they are redundant.
// CHECK-LABEL: func @fold_wait_op_test1
func.func @fold_wait_op_test1() {
  %1 = gpu.wait async
  gpu.wait []
  %3 = gpu.wait async
  gpu.wait [%3]
  return
}
// CHECK-NOT: gpu.wait
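// A sketch of the form this is expected to canonicalize to (illustrative
// only; kept in a comment so it is not part of the test input):
//   func.func @fold_wait_op_test1() {
//     return
//   }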

// -----

// Erase duplicate barriers.
// CHECK-LABEL: func @erase_barriers
//       CHECK-NEXT: gpu.barrier
//       CHECK-NEXT: return
func.func @erase_barriers() {
  gpu.barrier
  gpu.barrier
  return
}

// -----

// Replace uses of gpu.wait op with its async dependency.
// CHECK-LABEL: func @fold_wait_op_test2
func.func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
  %0 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%0] () : memref<5xf16>
  gpu.wait [%0]
  %1 = gpu.wait async [%0]
  %memref_0, %asyncToken_0 = gpu.alloc async [%1] () : memref<5xf16>
  gpu.wait [%1]
  return %memref, %memref_0 : memref<5xf16>, memref<5xf16>
}
// CHECK-NEXT: %[[TOKEN0:.*]] = gpu.wait async
// CHECK-NEXT: gpu.alloc async [%[[TOKEN0]]] ()
// CHECK-NEXT: %[[TOKEN1:.*]] = gpu.wait async
// CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
// CHECK-NEXT: return

// -----

// CHECK-LABEL: func @fold_memcpy_op
func.func @fold_memcpy_op(%arg0: i1) {
    %cst = arith.constant 0.000000e+00 : f16
    %1 = memref.alloc() : memref<2xf16>
    %2 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
    gpu.wait [%2]
    affine.store %cst, %memref[0] : memref<2xf16>
    %3 = gpu.wait async
    %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
    gpu.wait [%3]
    %5 = scf.if %arg0 -> (i1) {
      memref.dealloc %1 : memref<2xf16>
      scf.yield %arg0 : i1
    } else {
      memref.dealloc %1 : memref<2xf16>
      scf.yield %arg0 : i1
    }
    return
}
// CHECK-NOT: gpu.memcpy
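// Why this is expected to fold (an illustrative sketch, not checked by
// FileCheck): the copy writes %1, but %1 is never read afterwards (both
// branches of the scf.if only deallocate it), so the gpu.memcpy below is
// dead and the canonicalizer can erase it:
//   %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>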

// -----

// We cannot fold memcpy here as dest is a block argument.
// CHECK-LABEL: func @do_not_fold_memcpy_op1
func.func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
    %cst = arith.constant 0.000000e+00 : f16
    %2 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
    gpu.wait [%2]
    affine.store %cst, %memref[0] : memref<2xf16>
    %3 = gpu.wait async
    %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
    gpu.wait [%3]
    return
}
// CHECK: gpu.memcpy

// -----

// We cannot fold gpu.memcpy as dest is read by a later op.
// CHECK-LABEL: func @do_not_fold_memcpy_op2
func.func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
    %cst = arith.constant 0.000000e+00 : f16
    %1 = memref.alloc() : memref<2xf16>
    %2 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
    gpu.wait [%2]
    affine.store %cst, %memref[0] : memref<2xf16>
    %3 = gpu.wait async
    %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
    gpu.wait [%3]
    %5 = memref.load %1[%arg1] : memref<2xf16>
    return %5 : f16
}
// CHECK: gpu.memcpy

// -----

// We cannot fold gpu.memcpy, as the defining op of dest is not an alloc-like op.
// CHECK-LABEL: func @do_not_fold_memcpy_op3
func.func @do_not_fold_memcpy_op3(%arg0: memref<1xi8>, %arg1: memref<i1>) {
  %0 = arith.constant 0 : index
  %1 = memref.view %arg0[%0][] : memref<1xi8> to memref<i1>
  gpu.memcpy  %1, %arg1 : memref<i1>, memref<i1>
  func.return
}
// CHECK: gpu.memcpy

// -----

// CHECK-LABEL: @memcpy_after_cast
func.func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
  // CHECK-NOT: memref.cast
  // CHECK: gpu.memcpy
  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
  %1 = memref.cast %arg1 : memref<10xf32> to memref<?xf32>
  gpu.memcpy %0, %1 : memref<?xf32>, memref<?xf32>
  return
}
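// A sketch of the expected result, assuming the casts fold into the copy's
// operand types (illustrative only, not checked by FileCheck):
//   gpu.memcpy %arg0, %arg1 : memref<10xf32>, memref<10xf32>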

// -----

// CHECK-LABEL: @memset_after_cast
func.func @memset_after_cast(%arg0: memref<10xf32>, %arg1: f32) {
  // CHECK-NOT: memref.cast
  // CHECK: gpu.memset
  %0 = memref.cast %arg0 : memref<10xf32> to memref<?xf32>
  gpu.memset %0, %arg1 : memref<?xf32>, f32
  return
}
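// Expected result, analogous to the memcpy case above (an illustrative
// sketch, not checked by FileCheck):
//   gpu.memset %arg0, %arg1 : memref<10xf32>, f32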

// -----

// Test case: Folding of memref.dim(gpu.alloc(%size), %idx) -> %size
// CHECK-LABEL: func @gpu_dim_of_alloc(
//  CHECK-SAME:     %[[SIZE:[0-9a-z]+]]: index
//  CHECK-NEXT:   return %[[SIZE]] : index
func.func @gpu_dim_of_alloc(%size: index) -> index {
  %0 = gpu.alloc(%size) : memref<?xindex>
  %c0 = arith.constant 0 : index
  %1 = memref.dim %0, %c0 : memref<?xindex>
  return %1 : index
}

// -----

// memref.dim with an out-of-bounds index is not folded.
// CHECK-LABEL: func @out_of_bound_memref.dim
//  CHECK:   %[[MEMREF:.[a-z0-9A-Z_]+]] = memref.dim
//  CHECK:   return %[[MEMREF]] : index
func.func @out_of_bound_memref.dim(%arg : memref<?xi8>, %size: index) -> index {
  %c2 = arith.constant 2 : index
  %1 = memref.dim %arg, %c2 : memref<?xi8>
  return %1 : index
}

// -----

// CHECK-LABEL: func @simplify_gpu_launch
func.func @simplify_gpu_launch() attributes {llvm.emit_c_interface} {
  %cst = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %c16 = arith.constant 16 : index
  %c2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<2x16x16xf32>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c16 step %c1 {
      scf.for %arg2 = %c0 to %c16 step %c1 {
        memref.store %cst, %0[%arg0, %arg1, %arg2] : memref<2x16x16xf32>
      }
    }
  }
  %1 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%1] () : memref<2x16x16xf32>
  %2 = gpu.memcpy async [%1] %memref, %0 : memref<2x16x16xf32>, memref<2x16x16xf32>
  gpu.wait [%1]
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %c32, %arg10 = %c1, %arg11 = %c1) {
    %3 = arith.muli %arg5, %c32 : index
    %4 = arith.muli %arg4, %c32 : index
    %5 = arith.addi %3, %4 : index
    %6 = arith.addi %5, %arg3 : index
    %7 = arith.divui %6, %c32 : index
    %8 = arith.muli %arg0, %c16 : index
    %9 = arith.muli %arg1, %c2 : index
    %10 = arith.muli %7, %c2 : index
    %11 = arith.addi %9, %10 : index
    %12 = memref.load %memref[%11, %c0, %8] : memref<2x16x16xf32>
    %13 = arith.addi %11, %c1 : index
    %14 = memref.load %memref[%13, %c0, %8] : memref<2x16x16xf32>
    memref.store %12, %memref[%11, %c0, %8] : memref<2x16x16xf32>
    memref.store %14, %memref[%13, %c0, %8] : memref<2x16x16xf32>
    gpu.terminator
  }
  return
}

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%[[TIDX:.*]], %{{.*}}, %{{.*}}) in (%{{.*}} = %c32, %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) {
// CHECK-NEXT:    arith.divui %[[TIDX]], %c32 : index
// CHECK-NEXT:    arith.muli %{{.*}}, %c2 : index
// CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    arith.addi %{{.*}}, %[[C1]] : index
// CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    gpu.terminator
// CHECK-NEXT:  }

// -----

// CHECK-LABEL: func @make_reduce_uniform
//       CHECK: gpu.launch blocks
//       CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
//       CHECK: %[[V2:.*]] = gpu.all_reduce add %[[V1]] uniform {
//       CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
func.func @make_reduce_uniform() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.all_reduce add %1 {} : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// CHECK-LABEL: func @make_subgroup_reduce_uniform
//       CHECK: gpu.launch blocks
//       CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
//       CHECK: %[[V2:.*]] = gpu.subgroup_reduce add %[[V1]] uniform
//       CHECK: "test.test3"(%[[V2]]) : (i32) -> ()
func.func @make_subgroup_reduce_uniform() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.subgroup_reduce add %1 : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// CHECK-LABEL: func @subgroup_reduce_cluster_size_1
//       CHECK: gpu.launch blocks
//       CHECK: %[[V1:.*]] = "test.test2"() : () -> i32
//       CHECK: "test.test3"(%[[V1]]) : (i32) -> ()
func.func @subgroup_reduce_cluster_size_1() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = "test.test2"() : () -> i32
    %2 = gpu.subgroup_reduce add %1 cluster(size=1) : (i32) -> (i32)
    "test.test3"(%2) : (i32) -> ()
    gpu.terminator
  }
  return
}

// -----

// The GPU kernel does not have any side-effecting ops, so the entire
// gpu.launch op can fold away.

// CHECK-LABEL: func @gpu_launch_without_side_effects
//   CHECK-NOT:   gpu.launch
func.func @gpu_launch_without_side_effects() {
  %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %0#0, %arg7 = %0#1, %arg8 = %0#2)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %0#3, %arg10 = %0#4, %arg11 = %0#5) {
    %1 = arith.addi %arg0, %arg1 : index
    gpu.terminator
  }
  return
}
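// A sketch of what is expected to remain once the launch folds away
// (illustrative only; "test.test1" is unregistered, so its effects are
// unknown and it is kept):
//   func.func @gpu_launch_without_side_effects() {
//     %0:6 = "test.test1"() : () -> (index, index, index, index, index, index)
//     return
//   }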
292