# Runs the si-insert-waitcnts pass over the MIR in this file for gfx803 and
# gfx900; both invocations are verified with the same pair of prefixes.
# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts %s -o - \
# RUN:   | FileCheck -check-prefixes=CHECK,GFX89 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - \
# RUN:   | FileCheck -check-prefixes=CHECK,GFX89 %s

# IR stubs: every kernel body is just `ret void`. Each MIR function in this
# file has a stub of the same name, and the pointer arguments of
# flat_zero_waitcnt are the values named by the %ir.* memory operands on its
# loads.
--- |
  define amdgpu_kernel void @flat_zero_waitcnt(ptr addrspace(1) %global4,
                                               ptr addrspace(1) %global16,
                                               ptr %flat4,
                                               ptr %flat16) {
    ret void
  }

  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
    ret void
  }

  define amdgpu_kernel void @single_branch_successor_not_next_block() {
    ret void
  }

  define amdgpu_kernel void @preexisting_waitcnt() {
    ret void
  }

  define amdgpu_kernel void @bundle_no_waitcnt() {
    ret void
  }

  define amdgpu_kernel void @preexisting_waitcnt_in_bundle() {
    ret void
  }

  define amdgpu_kernel void @insert_in_bundle() {
    ret void
  }

  define amdgpu_kernel void @exit_bundle() {
    ret void
  }

  define amdgpu_kernel void @cross_bundle() {
    ret void
  }

  define amdgpu_kernel void @subregs16bit() {
    ret void
  }

  define amdgpu_kernel void @waitcnt_backedge() {
    ret void
  }
...
---

# Five blocks of FLAT loads whose memory operands vary between global
# pointers, flat pointers, and (in bb.1) no memory operand at all. The decoded
# form of each expected S_WAITCNT immediate is spelled out above its directive.

# CHECK-LABEL: name: flat_zero_waitcnt

# bb.0: both loads carry global memory operands.
# CHECK-LABEL: bb.0:
# CHECK: FLAT_LOAD_DWORD
# CHECK: FLAT_LOAD_DWORDX4
# Global loads will return in order, so the expected wait is:
#   s_waitcnt vmcnt(1)
# CHECK-NEXT: S_WAITCNT 3953

# bb.1: the first load has no memory operand, the second one is global.
# CHECK-LABEL: bb.1:
# CHECK: FLAT_LOAD_DWORD
#   s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4

# bb.2: both loads use flat pointers.
# CHECK-LABEL: bb.2:
# CHECK: FLAT_LOAD_DWORD
#   s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORDX4

# bb.3: a flat load followed by a global load, then a read of the first result.
# CHECK-LABEL: bb.3:
#   s_waitcnt vmcnt(0)
# GFX89: S_WAITCNT 3952
# CHECK: FLAT_LOAD_DWORD
# CHECK: FLAT_LOAD_DWORD
#   s_waitcnt vmcnt(0) lgkmcnt(0)
# GFX89: S_WAITCNT 112

# bb.4: a single flat load whose result is read immediately afterwards.
# CHECK-LABEL: bb.4:
# GFX89-NOT: S_WAITCNT
# CHECK: FLAT_LOAD_DWORD
#   s_waitcnt vmcnt(0) lgkmcnt(0)
# GFX89: S_WAITCNT 112

name: flat_zero_waitcnt

body: |
  bb.0:
    successors: %bb.1
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.global4)
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.global16)
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.global16)
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_BRANCH %bb.2

  bb.2:
    successors: %bb.3
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
    $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.flat16)
    $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
    S_BRANCH %bb.3

  bb.3:
    successors: %bb.4
    $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.global4)
    $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec
    S_BRANCH %bb.4

  bb.4:
    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.flat4)
    $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec
    S_ENDPGM 0
...
---
# bb.0 has exactly one successor and reaches it by falling through, so no
# wait is needed right after the load. The wait is expected in bb.1 instead,
# directly in front of the store that reads $vgpr0.

# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
# CHECK: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2
# CHECK-NOT: S_WAITCNT

# CHECK: bb.1:
# CHECK-NEXT: V_LSHLREV_B64_e64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_fallthrough_successor_no_end_block_wait

body: |
  bb.0:
    successors: %bb.1
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr

  bb.1:
    $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
...
---
# The block has a single predecessor, and that predecessor has a single
# successor, but the block is not laid out immediately after it, so it is
# less obvious that the wait is not needed.
# CHECK-LABEL: name: single_branch_successor_not_next_block

# bb.1 is not a successor of any block and is not reached by fallthrough
# (bb.0 ends in an unconditional branch to bb.2); its store must come
# straight after the block label.
# CHECK: bb.1:
# CHECK-NEXT: FLAT_STORE_DWORD
# CHECK-NEXT: S_ENDPGM 0

# bb.2 reads $vgpr0, which bb.0 loads; the wait is expected directly in front
# of that store:
#   s_waitcnt vmcnt(0) lgkmcnt(0)
# CHECK: bb.2:
# CHECK-NEXT: V_LSHLREV_B64_e64
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD
name: single_branch_successor_not_next_block

body: |
  bb.0:
    successors: %bb.2
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    S_BRANCH %bb.2

  bb.1:
    FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0

  bb.2:
    $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec
    FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
...

---

# The input already has an S_WAITCNT 0 between the load and the store that
# reads its result; it must stay the only wait from the load to the end of the
# function's output.
# CHECK-LABEL: name: preexisting_waitcnt{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK-NEXT: S_WAITCNT 0
# CHECK-NOT: S_WAITCNT
name: preexisting_waitcnt
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    S_WAITCNT 0
    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr

...

---

# The bundle between the load and the store holds only S_NOPs; the wait is
# expected right after the bundle's closing brace.
# CHECK-LABEL: name: bundle_no_waitcnt{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK-NEXT: BUNDLE
# CHECK-NEXT: S_NOP
# CHECK-NEXT: S_NOP
# CHECK-NEXT: }
# CHECK-NEXT: S_WAITCNT 112
name: bundle_no_waitcnt
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    BUNDLE {
      S_NOP 0
      S_NOP 0
    }
    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr

...
---

# The S_WAITCNT 0 that is already inside the bundle must be recognised, so
# that no additional wait is inserted after it.
# CHECK-LABEL: name: preexisting_waitcnt_in_bundle{{$}}
# CHECK: FLAT_LOAD_DWORD
# CHECK: S_WAITCNT 0
# CHECK-NOT: S_WAITCNT
name: preexisting_waitcnt_in_bundle
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    BUNDLE {
      S_NOP 0
      S_WAITCNT 0
    }
    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr

...

---

# The load that defines $vgpr0 and the store that uses it sit in the same
# bundle; the wait is expected between the two, inside the bundle.
# CHECK-LABEL: name: insert_in_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }

name: insert_in_bundle
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
      $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
      FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
    }
...
---

# The defining load is the last instruction of its bundle and the use comes
# after the bundle; the wait is expected between the closing brace and the
# store.

# CHECK-LABEL: name: exit_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr

name: exit_bundle
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
      $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    }

    FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr

...

---

# The defining load is in one bundle and the store that uses it is in the
# next one; the wait is expected as the first instruction of the second
# bundle.

# CHECK-LABEL: name: cross_bundle{{$}}
# CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }
# CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
# CHECK-NEXT: S_WAITCNT 112
# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
# CHECK-NEXT: }

name: cross_bundle
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr1_vgpr2
    BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 {
      $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
    }
    BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 {
      FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
    }
...
---
# Two 16-bit loads define $vgpr0 and $vgpr1, and the V_NOP reads only their
# lo16 sub-registers; a wait is still expected directly in front of it.
# CHECK-LABEL: name: subregs16bit
# CHECK: S_WAITCNT 112
# CHECK-NEXT: V_NOP_e32

name: subregs16bit
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4
    $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
    $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr
    V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16
...

---
# A wait is required before the S_CSELECT_B64 that writes $sgpr10_sgpr11:
# the S_LOAD further down the same block also writes $sgpr10_sgpr11, and
# because bb.10 branches back to bb.4 that load can come first in execution
# order.

# CHECK-LABEL: name: waitcnt_backedge
# CHECK: S_WAITCNT
# CHECK: $sgpr10_sgpr11 = S_CSELECT_B64
# CHECK: $sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM


name: waitcnt_backedge
body: |
  bb.0:
    renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr2_sgpr3, 32, 0 :: (load (s128) from `ptr addrspace(4) undef`, addrspace 4)

  bb.4:
    renamable $sgpr10_sgpr11 = S_CSELECT_B64 -1, 0, implicit killed $scc
    renamable $vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed renamable $vgpr5, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
    renamable $sgpr10_sgpr11 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 0, 0 :: (load (s64) from `ptr addrspace(4) undef`, align 4, addrspace 4)
    S_CBRANCH_SCC0 %bb.9, implicit killed $scc

  bb.9:
    renamable $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr14_sgpr15, implicit $exec
    S_CBRANCH_SCC0 %bb.14, implicit killed $scc

  bb.10:
    S_BRANCH %bb.4

  bb.14:
    S_ENDPGM 0
...