; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG,SI-SDAG %s
; NOTE(review): the tahiti GlobalISel invocation below is spelled XUN, so it is
; never executed; presumably it was disabled on purpose - confirm before
; re-enabling it.
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-GISEL,SI-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL,GCN-GISEL %s

; Codegen test for the llvm.amdgcn.ds.append intrinsic on addrspace(3) and
; addrspace(2) pointers, run through both SelectionDAG (-global-isel=0) and
; GlobalISel (-global-isel=1) on tahiti, bonaire, tonga and gfx900.
;
; The SI-SDAG and SI-GISEL prefixes used by ds_append_lds_over_max_offset are
; listed on the tahiti invocations above; without that, those check lines are
; silently ignored by FileCheck.

; addrspace(3) pointer passed as a kernel argument: the pointer is expected to
; be moved into m0 and ds_append emitted with no offset, and no buffer_wbinvl1
; may appear before the result is stored.
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; A constant index of 16383 i32 elements (65532 bytes) is expected to be folded
; into the instruction's offset field on every tested target.
; GCN-LABEL: {{^}}ds_append_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; The addrspace(3) pointer is itself loaded from addrspace(4) memory. With a
; 16-byte constant offset, tahiti is expected to add the offset to the pointer
; with s_add_i32 and emit ds_append without an offset, while the other targets
; are expected to fold it as offset:16.
; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_append [[RESULT:v[0-9]+]]{{$}}

; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}

; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_no_fold_offset_si(ptr addrspace(4) %lds.ptr, ptr addrspace(1) %out) #0 {
  %lds = load ptr addrspace(3), ptr addrspace(4) %lds.ptr, align 4
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 4
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; A constant index of 16384 i32 elements (65536 bytes, 0x10000) is one element
; past the offset folded in ds_append_lds_max_offset. The checks expect the
; constant to be applied to the pointer value instead (bit 16 set on tahiti,
; an add of 0x10000 on the other targets) and ds_append emitted with no offset.
; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI-SDAG: s_bitset1_b32 [[PTR]], 16
; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN-SDAG: s_mov_b32 m0, [[PTR]]

; SI-GISEL: s_bitset1_b32 m0, 16
; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000

; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_over_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Non-kernel function: the checks expect the pointer to arrive in v0 and to be
; moved to m0 through v_readfirstlane_b32 (via an intermediate SGPR with
; SelectionDAG, directly into m0 with GlobalISel).
; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-SDAG: s_mov_b32 m0, [[READLANE]]

; GCN-GISEL: v_readfirstlane_b32 m0, v0

; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_append_lds_vgpr_addr(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; addrspace(2) variant: same expectations as ds_append_lds, but the emitted
; instruction must carry the gds modifier.
; GCN-LABEL: {{^}}ds_append_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; addrspace(2) variant of ds_append_lds_max_offset: the 65532-byte offset is
; expected to be folded alongside the gds modifier.
; GCN-LABEL: {{^}}ds_append_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; addrspace(2) pointer with a 65536-byte constant offset. Only the absence of
; buffer_wbinvl1 is checked here; the exact instruction sequence is not pinned.
; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
define amdgpu_kernel void @ds_append_gds_over_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; ds_append followed by a volatile addrspace(3) load. On the pre-gfx900
; targets m0 is expected to be set back to -1 after the ds_append; on gfx900 no
; further reference to m0 is expected before the store and the ds_read_b32.
; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_append_lds_m0_restore(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val0 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val0, ptr addrspace(1) %out
  %val1 = load volatile i32, ptr addrspace(3) %lds
  ret void
}

; Make sure this selects successfully when the result has no use. The result
; register needs to be constrained.
; GCN-LABEL: {{^}}ds_append_lds_no_use:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
define amdgpu_kernel void @ds_append_lds_no_use(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %gep, i1 false)
  ret void
}

declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #1
declare i32 @llvm.amdgcn.ds.append.p2(ptr addrspace(2) nocapture, i1 immarg) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }