; Codegen tests for the llvm.amdgcn.ds.consume intrinsic, covering its p3 and
; p2 pointer overloads. Every target listed below is compiled with both
; -global-isel=0 and -global-isel=1 and the resulting assembly is matched with
; FileCheck.
;
; NOTE(review): the tahiti -global-isel=1 command is spelled "XUN", so lit does
; not execute it. This looks like a deliberate disable -- confirm before
; re-enabling it.
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-GISEL %s

; Basic case: the %lds kernel argument is passed straight to the intrinsic.
; The checks expect the loaded pointer to be copied into m0, a ds_consume with
; no offset or gds modifier, no buffer_wbinvl1, and a store of the result.
; GCN-LABEL: {{^}}ds_consume_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; The address is %lds advanced by 16383 i32 elements (16383 * 4 = 65532 bytes).
; The checks expect that displacement to appear as the instruction's offset
; field while m0 still receives the unmodified pointer.
; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Here the base pointer is itself loaded from memory through %lds.ptr and then
; advanced by 4 i32 elements (16 bytes). The tahiti checks expect an explicit
; scalar add of 16 and no offset field; the other targets are expected to use
; offset:16 instead.
; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}

; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}

; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_no_fold_offset_si(ptr addrspace(4) %lds.ptr, ptr addrspace(1) %out) #0 {
  %lds = load ptr addrspace(3), ptr addrspace(4) %lds.ptr, align 4
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 4
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; The displacement is 16384 i32 elements (16384 * 4 = 65536 = 0x10000 bytes),
; one element past the value used in the max_offset test. The checks expect
; the instruction to carry no offset field.
;
; NOTE(review): no FileCheck invocation in this file passes the CIPLUS-SDAG or
; CIPLUS-GISEL prefixes, so the two s_add checks below are currently never
; evaluated -- confirm whether the command lines above should list them.
; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000
; CIPLUS-GISEL: s_add_u32 [[PTR]], [[PTR]], 0x10000

; GCN-SDAG: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_over_max_offset(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Unlike the other tests this is a plain function rather than an amdgpu_kernel,
; and the checks expect the pointer to arrive in v0. The -global-isel=0 runs
; are expected to go through an SGPR via v_readfirstlane_b32 before writing m0;
; the -global-isel=1 runs are expected to read the first lane directly into m0.
; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-SDAG: s_mov_b32 m0, [[READLANE]]
; GCN-GISEL: v_readfirstlane_b32 m0, v0
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_consume_lds_vgpr_addr(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Same shape as ds_consume_lds, but using the p2 overload on the %gds
; (addrspace(2)) argument. The checks expect the gds modifier on the
; instruction.
; GCN-LABEL: {{^}}ds_consume_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gds, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; p2 counterpart of ds_consume_lds_max_offset: a 65532-byte displacement is
; expected in the offset field together with the gds modifier.
; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; p2 counterpart of ds_consume_lds_over_max_offset (65536-byte displacement).
; Only the absence of buffer_wbinvl1 is checked; the instruction sequence
; itself is not matched.
; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
define amdgpu_kernel void @ds_consume_gds_over_max_offset(ptr addrspace(2) %gds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(2) %gds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) %gep, i1 false)
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; The consume is followed by a volatile load through the same %lds pointer.
; Runs using the NOTGFX9 prefix are expected to write -1 to m0 after the
; consume and before the ds_read_b32; the gfx900 runs are expected not to
; mention m0 between the consume and the store.
; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_consume_lds_m0_restore(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %val0 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %lds, i1 false)
  store i32 %val0, ptr addrspace(1) %out
  %val1 = load volatile i32, ptr addrspace(3) %lds
  ret void
}

; Make sure this selects successfully with no use. The result register needs to be constrained.
; The intrinsic result %val is never stored or otherwise used; the checks still
; expect a ds_consume with a VGPR destination and the 65532-byte offset.
; GCN-LABEL: {{^}}ds_consume_lds_no_use:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
define amdgpu_kernel void @ds_consume_lds_no_use(ptr addrspace(3) %lds, ptr addrspace(1) %out) #0 {
  %gep = getelementptr inbounds i32, ptr addrspace(3) %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %gep, i1 false)
  ret void
}

; Intrinsic declarations for the two pointer overloads exercised above
; (addrspace(3) and addrspace(2)).
declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #1
declare i32 @llvm.amdgcn.ds.consume.p2(ptr addrspace(2) nocapture, i1 immarg) #1

; #0 is attached to every test function; #1 to the intrinsic declarations.
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }