; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s

; Instruction-selection tests for MUBUF (untyped buffer) loads, stores and
; atomics. Each test pins the addressing form that is expected in the emitted
; assembly (immediate offset field, soffset operand, addr64/offen VGPR address)
; and, where listed, the leading bytes of the machine encoding.

;;;==========================================================================;;;
;;; MUBUF LOAD TESTS
;;;==========================================================================;;;

; MUBUF load with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: {{^}}mubuf_load0:
; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
define amdgpu_kernel void @mubuf_load0(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %in, i64 1
  %1 = load i32, ptr addrspace(1) %0
  store i32 %1, ptr addrspace(1) %out
  ret void
}

; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_load1:
; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
define amdgpu_kernel void @mubuf_load1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %0 = getelementptr i8, ptr addrspace(1) %in, i64 4095
  %1 = load i8, ptr addrspace(1) %0
  store i8 %1, ptr addrspace(1) %out
  ret void
}

; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
; (1024 x i32 = 0x1000 bytes, expected to be materialized in an SGPR and used
; as the soffset operand).
; CHECK-LABEL: {{^}}mubuf_load2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
define amdgpu_kernel void @mubuf_load2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %in, i64 1024
  %1 = load i32, ptr addrspace(1) %0
  store i32 %1, ptr addrspace(1) %out
  ret void
}

; MUBUF load with a 12-bit immediate offset and a register offset
; NOTE(review): the negative check below uses an upper-case pattern, while the
; mnemonics in the expected output are lower-case, so it presumably never
; matches anything. Confirm whether a lower-case pattern was intended.
; CHECK-LABEL: {{^}}mubuf_load3:
; CHECK-NOT: ADD
; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
define amdgpu_kernel void @mubuf_load3(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %offset) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %in, i64 %offset
  %1 = getelementptr i32, ptr addrspace(1) %0, i64 1
  %2 = load i32, ptr addrspace(1) %1
  store i32 %2, ptr addrspace(1) %out
  ret void
}

; Raw buffer load whose soffset argument is the constant 64: the constant is
; expected to appear directly as the soffset operand of the load.
; CHECK-LABEL: {{^}}soffset_max_imm:
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
define amdgpu_gs void @soffset_max_imm(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
  %tmp1 = load ptr addrspace(8), ptr addrspace(4) %0
  %tmp2 = shl i32 %6, 2
  %tmp3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %tmp1, i32 %tmp2, i32 64, i32 1)
  %tmp4 = add i32 %6, 16
  %tmp1.4xi32 = bitcast ptr addrspace(8) %tmp1 to ptr addrspace(8)
  call void @llvm.amdgcn.raw.ptr.tbuffer.store.i32(i32 %tmp3, ptr addrspace(8) %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
  ret void
}

; Make sure immediates that aren't inline constants don't get folded into
; the soffset operand.
; FIXME: for this test we should be smart enough to shift the immediate into
; the offset field.
; CHECK-LABEL: {{^}}soffset_no_fold:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
define amdgpu_gs void @soffset_no_fold(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
  %tmp1 = load ptr addrspace(8), ptr addrspace(4) %0
  %tmp2 = shl i32 %6, 2
  %tmp3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %tmp1, i32 %tmp2, i32 65, i32 1)
  %tmp4 = add i32 %6, 16
  %tmp1.4xi32 = bitcast ptr addrspace(8) %tmp1 to ptr addrspace(8)
  call void @llvm.amdgcn.raw.ptr.tbuffer.store.i32(i32 %tmp3, ptr addrspace(8) %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
  ret void
}

;;;==========================================================================;;;
;;; MUBUF STORE TESTS
;;;==========================================================================;;;

; MUBUF store with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: {{^}}mubuf_store0:
; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
define amdgpu_kernel void @mubuf_store0(ptr addrspace(1) %out) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %out, i64 1
  store i32 0, ptr addrspace(1) %0
  ret void
}

; MUBUF store with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_store1:
; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
define amdgpu_kernel void @mubuf_store1(ptr addrspace(1) %out) {
entry:
  %0 = getelementptr i8, ptr addrspace(1) %out, i64 4095
  store i8 0, ptr addrspace(1) %0
  ret void
}

; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
; (1024 x i32 = 0x1000 bytes, expected in an SGPR used as the soffset operand).
; CHECK-LABEL: {{^}}mubuf_store2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
define amdgpu_kernel void @mubuf_store2(ptr addrspace(1) %out) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %out, i64 1024
  store i32 0, ptr addrspace(1) %0
  ret void
}

; MUBUF store with a 12-bit immediate offset and a register offset
; NOTE(review): the negative check below uses an upper-case pattern, while the
; mnemonics in the expected output are lower-case, so it presumably never
; matches anything. Confirm whether a lower-case pattern was intended.
; CHECK-LABEL: {{^}}mubuf_store3:
; CHECK-NOT: ADD
; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
define amdgpu_kernel void @mubuf_store3(ptr addrspace(1) %out, i64 %offset) {
entry:
  %0 = getelementptr i32, ptr addrspace(1) %out, i64 %offset
  %1 = getelementptr i32, ptr addrspace(1) %0, i64 1
  store i32 0, ptr addrspace(1) %1
  ret void
}

; Store directly through the kernel-argument pointer: no VGPR address ("off")
; and a zero soffset are expected.
; CHECK-LABEL: {{^}}store_sgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @store_sgpr_ptr(ptr addrspace(1) %out) {
  store i32 99, ptr addrspace(1) %out, align 4
  ret void
}

; Same as above with a constant element offset of 10 x i32 = 40 bytes, which is
; expected in the immediate offset field.
; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
define amdgpu_kernel void @store_sgpr_ptr_offset(ptr addrspace(1) %out) {
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 10
  store i32 99, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Constant offset of 32768 x i32 = 0x20000 bytes: expected to be materialized
; with s_mov_b32 and passed as the soffset operand.
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define amdgpu_kernel void @store_sgpr_ptr_large_offset(ptr addrspace(1) %out) {
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 32768
  store i32 99, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Same large constant offset as above, applied to an atomic add.
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(ptr addrspace(1) %out) {
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 32768
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 5 syncscope("agent") seq_cst
  ret void
}

; The address is indexed by the workitem id, so a VGPR address pair with the
; addr64 form is expected.
; CHECK-LABEL: {{^}}store_vgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
define amdgpu_kernel void @store_vgpr_ptr(ptr addrspace(1) %out) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
  store i32 99, ptr addrspace(1) %out.gep, align 4
  ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.raw.ptr.tbuffer.store.i32(i32, ptr addrspace(8), i32, i32, i32 immarg, i32 immarg) #2
declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) #3

attributes #0 = { nounwind readonly }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { nounwind willreturn writeonly }
attributes #3 = { nounwind readonly willreturn }
attributes #4 = { readnone }