// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s

gpu.module @kernel {
  // NVVM-LABEL:  llvm.func @private
  gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space<private>>) {
    // Allocate private memory inside the function.
    // NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
    // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr

    // ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
    // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<5>
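    // The private attribution becomes an alloca in the function body; the
    // ROCDL path allocates it in the AMDGPU private address space (5), while
    // the NVVM path uses the default address space.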

    // Populate the memref descriptor.
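    // The descriptor is a struct of (allocated pointer, aligned pointer,
    // offset, sizes array, strides array); both pointers point at the alloca.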
    // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]

    // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]

35    // "Store" lowering should work just as any other memref, only check that
36    // we emit some core instructions.
    // NVVM: llvm.extractvalue %[[descr6:.*]]
    // NVVM: llvm.getelementptr
    // NVVM: llvm.store

    // ROCDL: llvm.extractvalue %[[descr6:.*]]
    // ROCDL: llvm.getelementptr
    // ROCDL: llvm.store
    %c0 = arith.constant 0 : index
    memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space<private>>

    "terminator"() : () -> ()
  }
}

// -----

gpu.module @kernel {
  // Workgroup buffers are allocated as globals.
  // NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
  // NVVM-SAME:  addr_space = 3
  // NVVM-SAME:  !llvm.array<4 x f32>

  // ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
  // ROCDL-SAME:  addr_space = 3
  // ROCDL-SAME:  !llvm.array<4 x f32>
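  // Both targets place the buffer in address space 3 (NVVM shared memory,
  // ROCDL/AMDGPU LDS).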

  // NVVM-LABEL: llvm.func @workgroup
  // NVVM-SAME: {

  // ROCDL-LABEL: llvm.func @workgroup
  // ROCDL-SAME: {
  gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, #gpu.address_space<workgroup>>) {
    // Get the address of the first element in the global array.
    // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
    // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
    // NVVM-SAME: !llvm.ptr<3>

    // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
    // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
    // ROCDL-SAME: !llvm.ptr<3>

    // Populate the memref descriptor.
    // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
    // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]

    // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
    // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]

99    // "Store" lowering should work just as any other memref, only check that
100    // we emit some core instructions.
    // NVVM: llvm.extractvalue %[[descr6:.*]]
    // NVVM: llvm.getelementptr
    // NVVM: llvm.store

    // ROCDL: llvm.extractvalue %[[descr6:.*]]
    // ROCDL: llvm.getelementptr
    // ROCDL: llvm.store
    %c0 = arith.constant 0 : index
    memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space<workgroup>>

    "terminator"() : () -> ()
  }
}

// -----

gpu.module @kernel {
  // Check that the total size was computed correctly.
  // NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
  // NVVM-SAME:  addr_space = 3
  // NVVM-SAME:  !llvm.array<48 x f32>

  // ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
  // ROCDL-SAME:  addr_space = 3
  // ROCDL-SAME:  !llvm.array<48 x f32>
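  // The 4x2x6xf32 attribution is flattened into a 1-D global of
  // 4 * 2 * 6 = 48 elements.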

  // NVVM-LABEL: llvm.func @workgroup3d
  // ROCDL-LABEL: llvm.func @workgroup3d
  gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, #gpu.address_space<workgroup>>) {
    // Get the address of the first element in the global array.
    // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
    // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
    // NVVM-SAME: !llvm.ptr<3>

    // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
    // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
    // ROCDL-SAME: !llvm.ptr<3>

    // Populate the memref descriptor.
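    // Sizes are [4, 2, 6] and the row-major strides are [12, 6, 1]
    // (each stride is the product of the trailing sizes).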
    // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
    // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : i64
    // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
    // NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : i64
    // NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
    // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
    // NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
    // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
    // NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
    // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]

    // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
    // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
    // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
    // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
    // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
    // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
    // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
    // ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : i64
    // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
    // ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : i64
    // ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
    // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
    // ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
    // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
    // ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
    // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
    // ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]

    %c0 = arith.constant 0 : index
    memref.store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, #gpu.address_space<workgroup>>
    "terminator"() : () -> ()
  }
}

// -----

gpu.module @kernel {
  // Check that several buffers are defined.
  // NVVM: llvm.mlir.global internal @[[$buffer1:.*]]()
  // NVVM-SAME:  !llvm.array<1 x f32>
  // NVVM: llvm.mlir.global internal @[[$buffer2:.*]]()
  // NVVM-SAME:  !llvm.array<2 x f32>

  // ROCDL: llvm.mlir.global internal @[[$buffer1:.*]]()
  // ROCDL-SAME:  !llvm.array<1 x f32>
  // ROCDL: llvm.mlir.global internal @[[$buffer2:.*]]()
  // ROCDL-SAME:  !llvm.array<2 x f32>
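  // Each workgroup attribution gets its own global; each private attribution
  // below gets its own alloca.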

  // NVVM-LABEL: llvm.func @multiple
  // ROCDL-LABEL: llvm.func @multiple
  gpu.func @multiple(%arg0: f32)
      workgroup(%arg1: memref<1xf32, #gpu.address_space<workgroup>>, %arg2: memref<2xf32, #gpu.address_space<workgroup>>)
      private(%arg3: memref<3xf32, #gpu.address_space<private>>, %arg4: memref<4xf32, #gpu.address_space<private>>) {

    // Workgroup buffers.
    // NVVM: llvm.mlir.addressof @[[$buffer1]]
    // NVVM: llvm.mlir.addressof @[[$buffer2]]

    // ROCDL: llvm.mlir.addressof @[[$buffer1]]
    // ROCDL: llvm.mlir.addressof @[[$buffer2]]

    // Private buffers.
    // NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
    // NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr
    // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
    // NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr

    // ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
    // ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<5>
    // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
    // ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<5>

    %c0 = arith.constant 0 : index
    memref.store %arg0, %arg1[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
    memref.store %arg0, %arg2[%c0] : memref<2xf32, #gpu.address_space<workgroup>>
    memref.store %arg0, %arg3[%c0] : memref<3xf32, #gpu.address_space<private>>
    memref.store %arg0, %arg4[%c0] : memref<4xf32, #gpu.address_space<private>>
    "terminator"() : () -> ()
  }
}

// -----

gpu.module @kernel {
  // Check that alignment attributes are set correctly.
  // NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
  // NVVM-SAME:  addr_space = 3
  // NVVM-SAME:  alignment = 8
  // NVVM-SAME:  !llvm.array<48 x f32>

  // ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
  // ROCDL-SAME:  addr_space = 3
  // ROCDL-SAME:  alignment = 8
  // ROCDL-SAME:  !llvm.array<48 x f32>
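  // The llvm.align = 8 attribute on the workgroup attribution becomes the
  // global's alignment; the llvm.align = 4 attribute on the private
  // attribution becomes the alloca alignment checked below.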

  // NVVM-LABEL: llvm.func @explicitAlign
  // ROCDL-LABEL: llvm.func @explicitAlign
  gpu.func @explicitAlign(%arg0 : index)
    workgroup(%arg1: memref<48xf32, #gpu.address_space<workgroup>> {llvm.align = 8 : i64})
    private(%arg2: memref<48xf32, #gpu.address_space<private>> {llvm.align = 4 : i64}) {
    // NVVM: %[[size:.*]] = llvm.mlir.constant(48 : i64) : i64
    // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 {alignment = 4 : i64} : (i64) -> !llvm.ptr

    // ROCDL: %[[size:.*]] = llvm.mlir.constant(48 : i64) : i64
    // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 {alignment = 4 : i64} : (i64) -> !llvm.ptr<5>

    %val = memref.load %arg1[%arg0] : memref<48xf32, #gpu.address_space<workgroup>>
    memref.store %val, %arg2[%arg0] : memref<48xf32, #gpu.address_space<private>>
    "terminator"() : () -> ()
  }
}