// xref: /llvm-project/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir (revision 0c1c49f0ff8003aee22c3f26fca03c2f5385f355)
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12,RDNA
6
// Load from a 0-d (scalar) memref: stride is 0, numRecords is the element
// size in bytes (4 for i32), and the resource flag word is per-target.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
  // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[] : memref<i32> -> i32
  func.return %0 : i32
}
19
// Indexed i32 load: numRecords is the buffer size in bytes (64 * 4 = 256).
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
  // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> i32
  func.return %0 : i32
}
32
// Dynamically strided 2-d memref: the extent of the buffer is computed as the
// max over dimensions of size * stride (in elements), scaled to bytes, and the
// per-dimension indices are combined via trunc/mul/add into one i32 offset.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_strided
func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<16x16xi32, strided<[?, ?], offset: ?>>, %i: i32, %j: i32) -> i32 {
    // CHECK: %[[descriptor:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref<16x16xi32, strided<[?, ?], offset: ?>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[elem_size:.*]] = llvm.mlir.constant(4 : i32) : i32
    // CHECK: %[[algn_ptr:.*]] = llvm.extractvalue %[[descriptor]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[offset:.*]] = llvm.extractvalue %[[descriptor]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[ptr:.*]] = llvm.getelementptr %[[algn_ptr]][%[[offset]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16
    // CHECK: %[[sz_i:.*]] = llvm.extractvalue %[[descriptor]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[stride_i:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[ext_i:.*]] = llvm.mul %[[sz_i]], %[[stride_i]] : i64
    // CHECK: %[[sz_j:.*]] = llvm.extractvalue %[[descriptor]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[stride_j:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[ext_j:.*]] = llvm.mul %[[sz_j]], %[[stride_j]] : i64
    // CHECK: %[[num_records:.*]] = llvm.intr.umax(%[[ext_i]], %[[ext_j]]) : (i64, i64) -> i64
    // CHECK: %[[num_rec_i32:.*]] = llvm.trunc %[[num_records]] : i64 to i32
    // CHECK: %[[num_rec_bytes_i32:.*]] = llvm.mul %[[num_rec_i32]], %[[elem_size]] : i32
    // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %[[ptr]], %[[stride]], %[[num_rec_bytes_i32]], %{{.*}} : !llvm.ptr to <8>
    // CHECK: %[[stride_i_1:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[stride_i_i32:.*]] = llvm.trunc %[[stride_i_1]] : i64 to i32
    // CHECK: %[[t_0:.*]] = llvm.mul %{{.*}}, %[[stride_i_i32]] : i32
    // CHECK: %[[stride_j_1:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    // CHECK: %[[stride_j_i32:.*]] = llvm.trunc %[[stride_j_1]] : i64 to i32
    // CHECK: %[[t_1:.*]] = llvm.mul %{{.*}}, %[[stride_j_i32]] : i32
    // CHECK: %[[index:.*]] = llvm.add %[[t_0]], %[[t_1]] : i32
    // CHECK: %[[vgpr_off:.*]] = llvm.mul %[[index]], %[[elem_size]] : i32
    // CHECK: %[[zero_0:.*]] = llvm.mlir.constant(0 : i32) : i32
    // CHECK: %[[sgpr_off:.*]] = llvm.mul %[[zero_0]], %[[elem_size]] : i32
    // CHECK: %[[zero_1:.*]] = llvm.mlir.constant(0 : i32) : i32
    // CHECK: %[[v:.*]] = rocdl.raw.ptr.buffer.load %[[rsrc]], %[[vgpr_off]], %[[sgpr_off]], %[[zero_1]] : i32
    // CHECK: return %[[v]] : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%i, %j] :  memref<16x16xi32, strided<[?, ?], offset: ?>>, i32, i32 -> i32
  func.return %0 : i32
}
67
// With boundsCheck = false the RDNA flags word changes (553807872 instead of
// 822243328), while the GFX9 flags word stays the same.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
  func.return %0 : i32
}
78
// A vector<2xi32> load lowers directly, with no intermediate bitcast.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32
func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> {
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32>
  func.return %0 : vector<2xi32>
}
86
// i8 load: numRecords is 64 bytes (one byte per element).
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8
func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> i8
  func.return %0 : i8
}
96
// vector<2xi8> is loaded as a single i16 and bitcast back to the vector type.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8
func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i16
  // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8>
  func.return %0 : vector<2xi8>
}
107
// vector<16xi8> is loaded as vector<4xi32> and bitcast back.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8
func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> {
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
  // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8>
  func.return %0 : vector<16xi8>
}
116
// 8-bit float elements are loaded as i8 and converted back to the fp8 type via
// builtin.unrealized_conversion_cast.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ
func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: i32) -> f8E5M2FNUZ {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
  // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[loaded]] : i8 to f8E5M2FNUZ
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E5M2FNUZ>, i32 -> f8E5M2FNUZ
  func.return %0 : f8E5M2FNUZ
}
127
// Vectors of fp8 are loaded as i32, bitcast to vector<4xi8>, then converted to
// the fp8 vector type via builtin.unrealized_conversion_cast.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ
func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %idx: i32) -> vector<4xf8E4M3FNUZ> {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[cast:.*]] = llvm.bitcast %[[loaded]] : i32 to vector<4xi8>
  // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[cast]] : vector<4xi8> to vector<4xf8E4M3FNUZ>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E4M3FNUZ>, i32 -> vector<4xf8E4M3FNUZ>
  func.return %0 : vector<4xf8E4M3FNUZ>
}
139
// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
// Store to a 0-d (scalar) memref.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
  func.return
}
150
// Indexed i32 store: same resource construction as the load case.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}
161
// vector<2xi8> is bitcast to i16 before the store.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8
func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) {
  // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16
  // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32
  func.return
}
169
// vector<16xi8> is bitcast to vector<4xi32> before the store.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8
func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) {
  // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32>
  // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32
  func.return
}
177
// And more so for atomic add
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return
}
189
// Packed half-precision atomic add: numRecords is 128 bytes (64 * 2).
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16
func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf16>
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32
  func.return
}
200
// Packed bfloat16 atomic add, analogous to the v2f16 case above.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16
func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: memref<64xbf16>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xbf16>
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xbf16> -> memref<64xbf16>, i32
  func.return
}
211
// Atomic float max lowers to rocdl.raw.ptr.buffer.atomic.fmax.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
  amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return
}
222
// Atomic signed max lowers to rocdl.raw.ptr.buffer.atomic.smax.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}
233
// Atomic unsigned min lowers to rocdl.raw.ptr.buffer.atomic.umin.
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}
244
// Float compare-and-swap: operands are bitcast to i32 for the intrinsic and the
// result is bitcast back to f32.
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
  // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
  // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
  // CHECK: return %[[dstCast]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return %dst : f32
}
260
// i64 compare-and-swap needs no bitcasts; numRecords is 512 bytes (64 * 8).
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
  // CHECK: return %[[dst]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
  func.return %dst : i64
}
273
// Packed-f16 compare-and-swap: the 32-bit-wide vector operands are bitcast to
// i32 for the intrinsic and the result is bitcast back.
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_v2f16
// CHECK-SAME: (%[[src:.*]]: vector<2xf16>, %[[cmp:.*]]: vector<2xf16>, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : vector<2xf16>, %buf : memref<64xf16>, %idx: i32) -> vector<2xf16> {
  // CHECK-DAG: %[[srcBits:.+]] = llvm.bitcast %[[src]] : vector<2xf16> to i32
  // CHECK-DAG: %[[cmpBits:.+]] = llvm.bitcast %[[cmp]] : vector<2xf16> to i32
  // CHECK: %[[dstBits:.+]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcBits]], %[[cmpBits]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[dst:.+]] = llvm.bitcast %[[dstBits]] : i32 to vector<2xf16>
  // CHECK: return %[[dst]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32
  func.return %dst : vector<2xf16>
}
285
// amdgpu.lds_barrier expands differently per chipset: inline asm on gfx908 and
// gfx11, waitcnt + s.barrier on gfx90a/gfx10, and the split signal/wait
// barrier ops on gfx12.
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
  // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
  // GFX90A: rocdl.waitcnt -7937
  // GFX90A-NEXT: rocdl.s.barrier
  // GFX10:  rocdl.waitcnt -16129
  // GFX10-NEXT: rocdl.s.barrier
  // GFX11:  llvm.inline_asm has_side_effects asm_dialect = att
  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
  // GFX12:  rocdl.s.wait.dscnt 0
  // GFX12-NEXT: rocdl.s.barrier.signal -1
  // GFX12-NEXT: rocdl.s.barrier.wait -1
  amdgpu.lds_barrier
  func.return
}
302
// Each amdgpu.sched_barrier allow-mask variant maps to the matching integer
// bitmask on rocdl.sched.barrier (combined masks OR together, e.g. 2|16 = 18).
// CHECK-LABEL: func @sched_barrier
func.func @sched_barrier() {
  // CHECK: rocdl.sched.barrier 0
  amdgpu.sched_barrier allow = <none>
  // CHECK: rocdl.sched.barrier 1
  amdgpu.sched_barrier allow = <non_mem_non_sideffect>
  // CHECK: rocdl.sched.barrier 2
  amdgpu.sched_barrier allow = <valu>
  // CHECK: rocdl.sched.barrier 4
  amdgpu.sched_barrier allow = <salu>
  // CHECK: rocdl.sched.barrier 8
  amdgpu.sched_barrier allow = <mfma_wmma>
  // CHECK: rocdl.sched.barrier 16
  amdgpu.sched_barrier allow = <all_vmem>
  // CHECK: rocdl.sched.barrier 32
  amdgpu.sched_barrier allow = <vmem_read>
  // CHECK: rocdl.sched.barrier 64
  amdgpu.sched_barrier allow = <vmem_write>
  // CHECK: rocdl.sched.barrier 128
  amdgpu.sched_barrier allow = <all_ds>
  // CHECK: rocdl.sched.barrier 256
  amdgpu.sched_barrier allow = <ds_read>
  // CHECK: rocdl.sched.barrier 512
  amdgpu.sched_barrier allow = <ds_write>
  // CHECK: rocdl.sched.barrier 1024
  amdgpu.sched_barrier allow = <transcendental>
  // CHECK: rocdl.sched.barrier 18
  amdgpu.sched_barrier allow = <valu|all_vmem>
  func.return
}
333