; RUN: llc -mtriple=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions


; waitcnt should be inserted after exec modification
; SI: v_cmp_lt_i32_e32 vcc, 1,
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: s_and_saveexec_b64
; SI-NEXT: s_cbranch_execnz

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]]
;
define amdgpu_kernel void @test_if(i32 %b, ptr addrspace(1) %src, ptr addrspace(1) %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 1, label %case1
    i32 2, label %case2
  ]

case1:
  %arrayidx1 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  store i32 13, ptr addrspace(1) %arrayidx1, align 4
  br label %end

case2:
  %arrayidx5 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  store i32 17, ptr addrspace(1) %arrayidx5, align 4
  br label %end

default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, ptr addrspace(1) %arrayidx10, align 4
  br label %end

else:
  store i32 21, ptr addrspace(1) %arrayidx10, align 4
  br label %end

end:
  ret void
}

; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  br label %exit

exit:
  ret void
}

; FIXME: It would be better to endpgm in the then block.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  ret void

exit:
  ret void
}
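
; For reference, the pattern the checks above look for is the standard
; divergent-if lowering: compute the per-lane condition, mask off the
; false lanes with saveexec, and skip the then-block entirely if no lanes
; survive. A rough sketch (illustrative only, not checked output; the
; register numbers and label name are made up):
;
;   v_cmp_ne_u32_e32 vcc, 0, v0       ; per-lane condition into vcc
;   s_and_saveexec_b64 s[0:1], vcc    ; exec &= vcc, old exec saved in s[0:1]
;   s_cbranch_execz .LBB_join         ; all lanes false: skip the then-block
;   ...                               ; then-block runs on the active lanes
; .LBB_join:
;   s_or_b64 exec, exec, s[0:1]       ; restore the lanes masked off above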

; Final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified since the endpgm would
; terminate the then wavefront before reaching the store.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: s_cbranch_execnz [[EXIT:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: {{^.LBB[0-9]+_[0-9]+}}: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:.LBB[0-9]+_[0-9]+]]

; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm

; SI-NEXT: {{^}}[[EXIT]]:
; SI: ds_write_b32
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
  store i32 999, ptr addrspace(1) %gep
  ret void

exit:
  store volatile i32 7, ptr addrspace(3) undef
  ret void
}

; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:.LBB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x100
; SI: s_cbranch_scc1 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  %gep.src = getelementptr i32, ptr addrspace(1) %src, i32 %i
  %gep.dst = getelementptr i32, ptr addrspace(1) %dst, i32 %i
  %load = load i32, ptr addrspace(1) %src
  store i32 %load, ptr addrspace(1) %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}
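
; Note on simple_test_v_loop above: every lane executes the same number of
; iterations (%i starts at %tid and the loop exits when %i + 1 reaches
; %tid + 64, i.e. 64 trips for every lane), so the back-edge condition is
; uniform and the loop can close with a scalar compare and s_cbranch_scc1
; instead of exec-mask updates. The 0x100 in the s_cmpk_lg_i32 check is
; presumably the strength-reduced byte offset (64 iterations x 4 bytes).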

; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:.LBB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}

; Clear exec bits for workitems that load -1s
; SI: .L[[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_cbranch_execz [[LABEL_FLOW:.LBB[0-9]+_[0-9]+]]

; SI: ; %bb.{{[0-9]+}}: ; %bb20
; SI: buffer_store_dword

; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz .L[[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg3, i64 %tmp4
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp12
  %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp12
  %tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp12
  store i32 %tmp21, ptr addrspace(1) %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
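
; For reference, the Flow-block checks in multi_vcond_loop above follow the
; usual lowering for a loop with divergent exits: a lane-mask accumulator
; (COND_STATE) collects the lanes that want to leave, exec is cleared for
; them, and the loop repeats while any lane is still active. A rough sketch
; (illustrative only, not checked output; register numbers and the label
; name are made up):
;
; .LBB_loop:
;   ...                               ; loop body, possibly under saveexec
;   s_or_b64 s[0:1], s[2:3], s[0:1]   ; accumulate exiting lanes into s[0:1]
;   s_andn2_b64 exec, exec, s[0:1]    ; turn off the lanes that are done
;   s_cbranch_execnz .LBB_loop        ; loop again while any lane remains
;   s_or_b64 exec, exec, s[0:1]       ; after the loop, restore all lanes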