; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX9 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX9 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s

; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos.
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s


; Minimum offset
; GCN-LABEL: {{^}}gws_barrier_offset0:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_barrier v0 gds{{$}}

; LOOP: s_mov_b32 m0, 0{{$}}
; LOOP: [[LOOP:.LBB[0-9]+_[0-9]+]]:
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; LOOP-NEXT: ds_gws_barrier v0 gds
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]

; MIR-LABEL: name: gws_barrier_offset0{{$}}
; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec {
; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource")
; MIR-NEXT: S_WAITCNT 0
; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
  ret void
}

; MIR-LABEL: name: gws_barrier_offset63{{$}}

; Maximum offset
; GCN-LABEL: {{^}}gws_barrier_offset63:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: s_mov_b32 m0, 0{{$}}
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
; NOLOOP: ds_gws_barrier v0 offset:63 gds{{$}}
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
  ret void
}

; FIXME: Should be able to shift directly into m0
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16


; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
  ret void
}

; Variable offset in SGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]]

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}}
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
  %offset = add i32 %offset.base, 1
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
  ret void
}

; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
  ret void
}

; Variable offset in VGPR with constant add
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]]
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0

; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}}

; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16

; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]]
; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}}
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
  %vgpr.offset = add i32 %vgpr.offset.base, 3
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
  ret void
}

@lds = internal unnamed_addr addrspace(3) global i32 undef

; Check if m0 initialization is shared
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
; LOOP: s_mov_b32 m0, 0
; LOOP: s_setreg_imm32_b32
; LOOP: ds_gws_barrier v{{[0-9]+}} offset:10 gds
; LOOP: s_cbranch_scc1

; LOOP: s_mov_b32 m0, -1
; LOOP: ds_write_b32
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
  store i32 1, ptr addrspace(3) @lds
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
  store i32 2, ptr addrspace(3) @lds
  ret void
}

; Make sure this increments lgkmcnt
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v0 gds{{$}}
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_barrier_lgkmcnt(i32 %val) {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
  ret void
}

; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_barrier_wait_before:
; NOLOOP: s_waitcnt
; NOLOOP-NOT: s_waitcnt{{$}}
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
  store i32 0, ptr addrspace(1) %ptr
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; GCN-LABEL: {{^}}gws_barrier_wait_after:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP: load_{{dword|b32}}
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, ptr addrspace(1) %ptr) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  %load = load volatile i32, ptr addrspace(1) %ptr
  ret void
}

; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_barrier_fence_before:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: store_{{dword|b32}}
; GFX9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, ptr addrspace(1) %ptr) #0 {
  store i32 0, ptr addrspace(1) %ptr
  fence release
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; FIXME: Extra waitcnt
; GCN-LABEL: {{^}}gws_barrier_fence_after:
; NOLOOP: s_mov_b32 m0, 0{{$}}
; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: load_{{dword|b32}}

define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, ptr addrspace(1) %ptr) #0 {
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  fence release
  %load = load volatile i32, ptr addrspace(1) %ptr
  ret void
}

; FIXME: Should a wait be inserted here, or is an explicit fence needed?
; GCN-LABEL: {{^}}gws_init_barrier:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

; FIXME: Why vmcnt, not expcnt?
; GCN-LABEL: {{^}}gws_init_fence_barrier:
; NOLOOP: s_mov_b32 m0, 0
; NOLOOP: ds_gws_init v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
  fence release
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
  ret void
}

declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
declare i32 @llvm.amdgcn.workitem.id.x() #3

attributes #0 = { nounwind }
attributes #1 = { convergent inaccessiblememonly nounwind }
attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }