; Codegen test for the llvm.amdgcn.ds.gws.init intrinsic on AMDGPU.
;
; Every target is run twice, once with SelectionDAG (-global-isel=0) and once
; with GlobalISel (-global-isel=1). tahiti, hawaii and fiji are checked with
; the LOOP prefix, where ds_gws_init is expected inside a loop that clears
; hwreg(HW_REG_TRAPSTS, 8, 1), issues the instruction, waits, re-reads the
; same field and branches back while it is non-zero. gfx900, gfx1010 and
; gfx1100 are checked with the NOLOOP prefixes, where a single ds_gws_init is
; expected.

; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s

; Minimum offset
; GCN-LABEL: {{^}}gws_init_offset0:
; GCN-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; GCN-DAG: s_mov_b32 m0, 0{{$}}
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 gds{{$}}

; LOOP: [[LOOP:.LBB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_init v0 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
  ret void
}

; Maximum offset
; GCN-LABEL: {{^}}gws_init_offset63:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 offset:63 gds{{$}}


; LOOP: s_mov_b32 m0, 0{{$}}
; LOOP: [[LOOP:.LBB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_init v0 offset:63 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
  ret void
}

; Variable offset passed as a kernel argument. The offset is expected to reach
; m0 shifted left by 16; the SelectionDAG checks go through a temporary SGPR
; while the GlobalISel checks shift straight into m0.
; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
  ret void
}

; Variable offset in SGPR with constant add
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_init [[GWS_VAL]] offset:1 gds{{$}}
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
  %offset = add i32 %offset.base, 1
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
  ret void
}

; Variable offset taken from the workitem id. The checks expect the value to
; be moved to an SGPR with v_readfirstlane_b32 before it is shifted into m0.
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
  ret void
}

; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_init v0 offset:3 gds{{$}}
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
  %vgpr.offset = add i32 %vgpr.offset.base, 3
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
  ret void
}

; LDS scratch variable for the m0 sharing test below. The contents are never
; read, so the initializer is poison rather than the deprecated undef.
@lds = internal unnamed_addr addrspace(3) global i32 poison

; Check if m0 initialization is shared.
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v{{[0-9]+}} offset:10 gds

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
; LOOP: s_mov_b32 m0, 0
; LOOP: s_setreg_imm32_b32
; LOOP: ds_gws_init v{{[0-9]+}} offset:10 gds
; LOOP: s_cbranch_scc1

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
  store volatile i32 1, ptr addrspace(3) @lds
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
  store i32 2, ptr addrspace(3) @lds
  ret void
}

; Non-kernel function: the checks expect a full s_waitcnt directly after
; ds_gws_init, immediately followed by the s_setpc_b64 return.
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_init v0 gds{{$}}
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_init_lgkmcnt(i32 %val) {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
  ret void
}

; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_init_wait_before:
; NOLOOP: s_waitcnt lgkmcnt(0)
; NOLOOP-NOT: s_waitcnt
; NOLOOP: ds_gws_init
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
  store i32 0, ptr addrspace(1) %ptr
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  ret void
}

declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
attributes #2 = { nounwind readnone speculatable }