// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12,RDNA

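// Note on the constants checked below: numRecords is a byte count (element
// count times element byte width, e.g. 64 * 4 = 256 for memref<64xi32>), and
// the resource flags word depends on the target family: 159744 on GFX9 versus
// 822243328 on the RDNA chipsets (and a different RDNA value again when
// bounds checking is disabled).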
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
  // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[] : memref<i32> -> i32
  func.return %0 : i32
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
  // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> i32
  func.return %0 : i32
}

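// For dynamically strided memrefs, the lowering below reads sizes and strides
// out of the memref descriptor, takes the umax over the per-dimension
// size * stride products as the record count, and scales that by the element
// size to get numRecords in bytes.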
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_strided
func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<16x16xi32, strided<[?, ?], offset: ?>>, %i: i32, %j: i32) -> i32 {
  // CHECK: %[[descriptor:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref<16x16xi32, strided<[?, ?], offset: ?>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[elem_size:.*]] = llvm.mlir.constant(4 : i32) : i32
  // CHECK: %[[algn_ptr:.*]] = llvm.extractvalue %[[descriptor]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[offset:.*]] = llvm.extractvalue %[[descriptor]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[ptr:.*]] = llvm.getelementptr %[[algn_ptr]][%[[offset]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32
  // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16
  // CHECK: %[[sz_i:.*]] = llvm.extractvalue %[[descriptor]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[stride_i:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[ext_i:.*]] = llvm.mul %[[sz_i]], %[[stride_i]] : i64
  // CHECK: %[[sz_j:.*]] = llvm.extractvalue %[[descriptor]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[stride_j:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[ext_j:.*]] = llvm.mul %[[sz_j]], %[[stride_j]] : i64
  // CHECK: %[[num_records:.*]] = llvm.intr.umax(%[[ext_i]], %[[ext_j]]) : (i64, i64) -> i64
  // CHECK: %[[num_rec_i32:.*]] = llvm.trunc %[[num_records]] : i64 to i32
  // CHECK: %[[num_rec_bytes_i32:.*]] = llvm.mul %[[num_rec_i32]], %[[elem_size]] : i32
  // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %[[ptr]], %[[stride]], %[[num_rec_bytes_i32]], %{{.*}} : !llvm.ptr to <8>
  // CHECK: %[[stride_i_1:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[stride_i_i32:.*]] = llvm.trunc %[[stride_i_1]] : i64 to i32
  // CHECK: %[[t_0:.*]] = llvm.mul %{{.*}}, %[[stride_i_i32]] : i32
  // CHECK: %[[stride_j_1:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
  // CHECK: %[[stride_j_i32:.*]] = llvm.trunc %[[stride_j_1]] : i64 to i32
  // CHECK: %[[t_1:.*]] = llvm.mul %{{.*}}, %[[stride_j_i32]] : i32
  // CHECK: %[[index:.*]] = llvm.add %[[t_0]], %[[t_1]] : i32
  // CHECK: %[[vgpr_off:.*]] = llvm.mul %[[index]], %[[elem_size]] : i32
  // CHECK: %[[zero_0:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[sgpr_off:.*]] = llvm.mul %[[zero_0]], %[[elem_size]] : i32
  // CHECK: %[[zero_1:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[v:.*]] = rocdl.raw.ptr.buffer.load %[[rsrc]], %[[vgpr_off]], %[[sgpr_off]], %[[zero_1]] : i32
  // CHECK: return %[[v]] : i32
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%i, %j] : memref<16x16xi32, strided<[?, ?], offset: ?>>, i32, i32 -> i32
  func.return %0 : i32
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
  func.return %0 : i32
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32
func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> {
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32>
  func.return %0 : vector<2xi32>
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8
func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> i8
  func.return %0 : i8
}

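// Loads of small vectors and of the 8-bit float types are emitted as integer
// loads of the same total bit width: vector<2xi8> comes back as an i16,
// vector<16xi8> as a vector<4xi32>, a single f8 value as an i8, and
// vector<4xf8...> as an i32, followed by bitcasts or
// unrealized_conversion_casts back to the requested type.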
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8
func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i16
  // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8>
  func.return %0 : vector<2xi8>
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8
func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> {
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
  // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8>
  func.return %0 : vector<16xi8>
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ
func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: i32) -> f8E5M2FNUZ {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
  // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[loaded]] : i8 to f8E5M2FNUZ
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E5M2FNUZ>, i32 -> f8E5M2FNUZ
  func.return %0 : f8E5M2FNUZ
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ
func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %idx: i32) -> vector<4xf8E4M3FNUZ> {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
  // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[cast:.*]] = llvm.bitcast %[[loaded]] : i32 to vector<4xi8>
  // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[cast]] : vector<4xi8> to vector<4xf8E4M3FNUZ>
  // CHECK: return %[[ret]]
  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E4M3FNUZ>, i32 -> vector<4xf8E4M3FNUZ>
  func.return %0 : vector<4xf8E4M3FNUZ>
}

// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8
func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) {
  // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16
  // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8
func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) {
  // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32>
  // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
  amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32
  func.return
}

// The same applies, even more so, to the atomic operations below
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16
func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf16>
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16
func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: memref<64xbf16>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xbf16>
  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xbf16> -> memref<64xbf16>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
  amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}

// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
  func.return
}

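// Atomic compare-and-swap on non-integer element types goes through a bitcast
// to the equally sized integer (f32 and vector<2xf16> both become i32) before
// the ROCDL intrinsic, with the result bitcast back; i64 operands are passed
// through unchanged.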
// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
  // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
  // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
  // CHECK: return %[[dstCast]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
  func.return %dst : f32
}

// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
  // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
  // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
  // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
  // CHECK: return %[[dst]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
  func.return %dst : i64
}

// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_v2f16
// CHECK-SAME: (%[[src:.*]]: vector<2xf16>, %[[cmp:.*]]: vector<2xf16>, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : vector<2xf16>, %buf : memref<64xf16>, %idx: i32) -> vector<2xf16> {
  // CHECK-DAG: %[[srcBits:.+]] = llvm.bitcast %[[src]] : vector<2xf16> to i32
  // CHECK-DAG: %[[cmpBits:.+]] = llvm.bitcast %[[cmp]] : vector<2xf16> to i32
  // CHECK: %[[dstBits:.+]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcBits]], %[[cmpBits]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
  // CHECK: %[[dst:.+]] = llvm.bitcast %[[dstBits]] : i32 to vector<2xf16>
  // CHECK: return %[[dst]]
  %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32
  func.return %dst : vector<2xf16>
}

// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
  // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
  // GFX90A: rocdl.waitcnt -7937
  // GFX90A-NEXT: rocdl.s.barrier
  // GFX10: rocdl.waitcnt -16129
  // GFX10-NEXT: rocdl.s.barrier
  // GFX11: llvm.inline_asm has_side_effects asm_dialect = att
  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
  // GFX12: rocdl.s.wait.dscnt 0
  // GFX12-NEXT: rocdl.s.barrier.signal -1
  // GFX12-NEXT: rocdl.s.barrier.wait -1
  amdgpu.lds_barrier
  func.return
}

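// The allow mask on sched_barrier lowers to a bitmask; combined masks OR
// their bits together (valu|all_vmem is 2 | 16 = 18).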
// CHECK-LABEL: func @sched_barrier
func.func @sched_barrier() {
  // CHECK: rocdl.sched.barrier 0
  amdgpu.sched_barrier allow = <none>
  // CHECK: rocdl.sched.barrier 1
  amdgpu.sched_barrier allow = <non_mem_non_sideffect>
  // CHECK: rocdl.sched.barrier 2
  amdgpu.sched_barrier allow = <valu>
  // CHECK: rocdl.sched.barrier 4
  amdgpu.sched_barrier allow = <salu>
  // CHECK: rocdl.sched.barrier 8
  amdgpu.sched_barrier allow = <mfma_wmma>
  // CHECK: rocdl.sched.barrier 16
  amdgpu.sched_barrier allow = <all_vmem>
  // CHECK: rocdl.sched.barrier 32
  amdgpu.sched_barrier allow = <vmem_read>
  // CHECK: rocdl.sched.barrier 64
  amdgpu.sched_barrier allow = <vmem_write>
  // CHECK: rocdl.sched.barrier 128
  amdgpu.sched_barrier allow = <all_ds>
  // CHECK: rocdl.sched.barrier 256
  amdgpu.sched_barrier allow = <ds_read>
  // CHECK: rocdl.sched.barrier 512
  amdgpu.sched_barrier allow = <ds_write>
  // CHECK: rocdl.sched.barrier 1024
  amdgpu.sched_barrier allow = <transcendental>
  // CHECK: rocdl.sched.barrier 18
  amdgpu.sched_barrier allow = <valu|all_vmem>
  func.return
}