; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; OBJ: Relocations [
; OBJ-NEXT: ]

; s_sleep is used to emit an instruction that is always 4 bytes. The inline
; asm size estimate always assumes each instruction is the maximum size.
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1

define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
; GCN-LABEL: uniform_conditional_max_short_forward_branch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
; GCN-NEXT:  ; %bb.1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_sleep 0
; GCN-NEXT:  .LBB0_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
; 24 bytes
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64
     v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
; GCN-LABEL: uniform_conditional_min_long_forward_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_cbranch_scc0 .LBB1_1
; GCN-NEXT:  ; %bb.3: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[2:3]
; GCN-NEXT:  .Lpost_getpc0:
; GCN-NEXT:    s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc0)&4294967295
; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc0)>>32
; GCN-NEXT:    s_setpc_b64 s[2:3]
; GCN-NEXT:  .LBB1_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB1_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
; 32 bytes
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64
     v_nop_e64
     v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, ptr addrspace(1) %arg
  ret void
}
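
; Here the condition is computed with v_cmp into an SGPR pair and branched on
; via s_cbranch_vccz, so the long-branch expansion below picks a scratch SGPR
; pair (s[8:9]) for the s_getpc/s_add/s_setpc sequence instead of s[2:3],
; which holds the compare result.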
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cmp_eq_f32_e64 s[2:3], s0, 0
; GCN-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN-NEXT:    s_cbranch_vccz .LBB2_1
; GCN-NEXT:  ; %bb.3: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[8:9]
; GCN-NEXT:  .Lpost_getpc1:
; GCN-NEXT:    s_add_u32 s8, s8, (.LBB2_2-.Lpost_getpc1)&4294967295
; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB2_2-.Lpost_getpc1)>>32
; GCN-NEXT:    s_setpc_b64 s[8:9]
; GCN-NEXT:  .LBB2_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; 32 bytes
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB2_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: min_long_forward_vbranch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_cbranch_execnz .LBB3_1
; GCN-NEXT:  ; %bb.3: ; %bb
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc2:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB3_2-.Lpost_getpc2)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB3_2-.Lpost_getpc2)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB3_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; 32 bytes
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB3_2: ; %bb3
; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
  %load = load volatile i32, ptr addrspace(1) %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, ptr addrspace(1) %gep
  ret void
}
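
; The backward branch to the loop header is out of range, so the latch branch
; is inverted (s_cbranch_scc0 to %bb3) and the edge back to .LBB4_1 is expanded
; into the s_getpc_b64/s_add_u32/s_addc_u32/s_setpc_b64 sequence; the computed
; offset is negative, since .LBB4_1 precedes .Lpost_getpc3.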
define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: long_backward_sbranch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_mov_b32 s0, 0
; GCN-NEXT:  .LBB4_1: ; %bb2
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_add_i32 s0, s0, 1
; GCN-NEXT:    s_cmp_lt_i32 s0, 10
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
; GCN-NEXT:  ; %bb.3: ; %bb2
; GCN-NEXT:    ; in Loop: Header=BB4_1 Depth=1
; GCN-NEXT:    s_getpc_b64 s[2:3]
; GCN-NEXT:  .Lpost_getpc3:
; GCN-NEXT:    s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295
; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32
; GCN-NEXT:    s_setpc_b64 s[2:3]
; GCN-NEXT:  .LBB4_2: ; %bb3
; GCN-NEXT:    s_endpgm
bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
  ; 24 bytes
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64
     v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
; expansion of the conditional branch from %bb0 to %bb3).

define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_mov_b64 s[0:1], -1
; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
; GCN-NEXT:  ; %bb.7: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc5:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_4-.Lpost_getpc5)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_4-.Lpost_getpc5)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB5_1: ; %Flow
; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
; GCN-NEXT:  .LBB5_2: ; %bb2
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 17
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB5_3: ; %bb4
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 63
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
; GCN-NEXT:  .LBB5_4: ; %bb3
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_mov_b64 vcc, exec
; GCN-NEXT:    s_cbranch_execnz .LBB5_5
; GCN-NEXT:  ; %bb.9: ; %bb3
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc6:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_2-.Lpost_getpc6)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_2-.Lpost_getpc6)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB5_5: ; %bb3
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc4:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_3-.Lpost_getpc4)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_3-.Lpost_getpc4)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, ptr addrspace(1) undef
  br label %bb4

bb3:
  ; 32 byte asm
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64
     v_nop_e64
     v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, ptr addrspace(1) %arg
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }