; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 | FileCheck --check-prefix=GCN %s

; Check that the waitcnt insertion algorithm correctly propagates wait counts
; from before a loop to the loop header.

; GCN-LABEL: {{^}}testKernel
; GCN: BB0_1:
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: v_cmp_eq_f32_e32

; Two 100-element float tables in address space 1. The kernel below reads both
; tables element by element and compares them. Initializers are laid out ten
; values per row, matching the ten-element stride of the loop body.
@data_generic = addrspace(1) global [100 x float] [
  float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000,
  float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000,
  float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000,
  float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000,
  float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000,
  float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000,
  float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000,
  float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000,
  float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000,
  float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000
], align 4
@data_reference = addrspace(1) global [100 x float] [
  float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000,
  float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000,
  float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000,
  float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000,
  float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000,
  float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000,
  float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000,
  float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000,
  float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000,
  float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000
], align 4

; Kernel layout:
;   bb   - stores <1.0, 1.0> over elements 4 and 5 of each table through a
;          pointer addrspacecast to the default address space, then enters
;          the loop.
;   bb18 - loop body handling ten element pairs per iteration; each pair is
;          loaded from both tables, compared with fcmp oeq, and the result is
;          ANDed into a running i1 flag. Exits to bb1 once the counter is 100.
;   bb1  - widens the flag to i32 and stores it into %arg at a per-work-item
;          index.
define amdgpu_kernel void @testKernel(ptr addrspace(1) nocapture %arg) local_unnamed_addr #0 {
bb:
  ; These stores are issued before the loop is entered.
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr ([100 x float], ptr addrspacecast (ptr addrspace(1) @data_generic to ptr), i64 0, i64 4), align 4
  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr ([100 x float], ptr addrspacecast (ptr addrspace(1) @data_reference to ptr), i64 0, i64 4), align 4
  br label %bb18

bb1:                                              ; preds = %bb18
  ; index = (workgroup.id.x * i16 loaded at dispatch.ptr + 4) + workitem.id.x
  ;         + i64 loaded at implicitarg.ptr, masked to the low 32 bits.
  %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %tmp4 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 4
  %tmp6 = load i16, ptr addrspace(4) %tmp4, align 4
  %tmp7 = zext i16 %tmp6 to i32
  %tmp8 = mul i32 %tmp3, %tmp7
  %tmp9 = add i32 %tmp8, %tmp2
  %tmp10 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %tmp11 = zext i32 %tmp9 to i64
  %tmp13 = load i64, ptr addrspace(4) %tmp10, align 8
  %tmp14 = add i64 %tmp13, %tmp11
  ; %tmp99 is the accumulated comparison flag from the final loop iteration.
  %tmp15 = zext i1 %tmp99 to i32
  %tmp16 = and i64 %tmp14, 4294967295
  %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
  store i32 %tmp15, ptr addrspace(1) %tmp17, align 4
  ret void

bb18:                                             ; preds = %bb18, %bb
  ; %tmp19 / %tmp20: 64-bit and 32-bit views of the element counter.
  ; %tmp21: running "all pairs equal so far" flag, initially true.
  %tmp19 = phi i64 [ 0, %bb ], [ %tmp102, %bb18 ]
  %tmp20 = phi i32 [ 0, %bb ], [ %tmp100, %bb18 ]
  %tmp21 = phi i1 [ true, %bb ], [ %tmp99, %bb18 ]
  ; element counter + 0
  %tmp22 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp19
  %tmp23 = load float, ptr addrspace(1) %tmp22, align 4
  %tmp24 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp19
  %tmp25 = load float, ptr addrspace(1) %tmp24, align 4
  %tmp26 = fcmp oeq float %tmp23, %tmp25
  %tmp27 = and i1 %tmp21, %tmp26
  ; element counter + 1 (formed with 'or')
  %tmp28 = or i32 %tmp20, 1
  %tmp29 = sext i32 %tmp28 to i64
  %tmp30 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp29
  %tmp31 = load float, ptr addrspace(1) %tmp30, align 4
  %tmp32 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp29
  %tmp33 = load float, ptr addrspace(1) %tmp32, align 4
  %tmp34 = fcmp oeq float %tmp31, %tmp33
  %tmp35 = and i1 %tmp27, %tmp34
  ; element counter + 2
  %tmp36 = add nuw nsw i32 %tmp20, 2
  %tmp37 = sext i32 %tmp36 to i64
  %tmp38 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp37
  %tmp39 = load float, ptr addrspace(1) %tmp38, align 4
  %tmp40 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp37
  %tmp41 = load float, ptr addrspace(1) %tmp40, align 4
  %tmp42 = fcmp oeq float %tmp39, %tmp41
  %tmp43 = and i1 %tmp35, %tmp42
  ; element counter + 3
  %tmp44 = add nuw nsw i32 %tmp20, 3
  %tmp45 = sext i32 %tmp44 to i64
  %tmp46 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp45
  %tmp47 = load float, ptr addrspace(1) %tmp46, align 4
  %tmp48 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp45
  %tmp49 = load float, ptr addrspace(1) %tmp48, align 4
  %tmp50 = fcmp oeq float %tmp47, %tmp49
  %tmp51 = and i1 %tmp43, %tmp50
  ; element counter + 4
  %tmp52 = add nuw nsw i32 %tmp20, 4
  %tmp53 = sext i32 %tmp52 to i64
  %tmp54 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp53
  %tmp55 = load float, ptr addrspace(1) %tmp54, align 4
  %tmp56 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp53
  %tmp57 = load float, ptr addrspace(1) %tmp56, align 4
  %tmp58 = fcmp oeq float %tmp55, %tmp57
  %tmp59 = and i1 %tmp51, %tmp58
  ; element counter + 5
  %tmp60 = add nuw nsw i32 %tmp20, 5
  %tmp61 = sext i32 %tmp60 to i64
  %tmp62 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp61
  %tmp63 = load float, ptr addrspace(1) %tmp62, align 4
  %tmp64 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp61
  %tmp65 = load float, ptr addrspace(1) %tmp64, align 4
  %tmp66 = fcmp oeq float %tmp63, %tmp65
  %tmp67 = and i1 %tmp59, %tmp66
  ; element counter + 6
  %tmp68 = add nuw nsw i32 %tmp20, 6
  %tmp69 = sext i32 %tmp68 to i64
  %tmp70 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp69
  %tmp71 = load float, ptr addrspace(1) %tmp70, align 4
  %tmp72 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp69
  %tmp73 = load float, ptr addrspace(1) %tmp72, align 4
  %tmp74 = fcmp oeq float %tmp71, %tmp73
  %tmp75 = and i1 %tmp67, %tmp74
  ; element counter + 7
  %tmp76 = add nuw nsw i32 %tmp20, 7
  %tmp77 = sext i32 %tmp76 to i64
  %tmp78 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp77
  %tmp79 = load float, ptr addrspace(1) %tmp78, align 4
  %tmp80 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp77
  %tmp81 = load float, ptr addrspace(1) %tmp80, align 4
  %tmp82 = fcmp oeq float %tmp79, %tmp81
  %tmp83 = and i1 %tmp75, %tmp82
  ; element counter + 8
  %tmp84 = add nuw nsw i32 %tmp20, 8
  %tmp85 = sext i32 %tmp84 to i64
  %tmp86 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp85
  %tmp87 = load float, ptr addrspace(1) %tmp86, align 4
  %tmp88 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp85
  %tmp89 = load float, ptr addrspace(1) %tmp88, align 4
  %tmp90 = fcmp oeq float %tmp87, %tmp89
  %tmp91 = and i1 %tmp83, %tmp90
  ; element counter + 9
  %tmp92 = add nuw nsw i32 %tmp20, 9
  %tmp93 = sext i32 %tmp92 to i64
  %tmp94 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp93
  %tmp95 = load float, ptr addrspace(1) %tmp94, align 4
  %tmp96 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp93
  %tmp97 = load float, ptr addrspace(1) %tmp96, align 4
  %tmp98 = fcmp oeq float %tmp95, %tmp97
  %tmp99 = and i1 %tmp91, %tmp98
  ; Advance the counter by ten and leave the loop once it reaches 100.
  %tmp100 = add nuw nsw i32 %tmp20, 10
  %tmp101 = icmp eq i32 %tmp100, 100
  %tmp102 = sext i32 %tmp100 to i64
  br i1 %tmp101, label %bb1, label %bb18
}

; Function Attrs: nounwind readnone speculatable
declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1

; Function Attrs: nounwind readnone speculatable
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Function Attrs: nounwind readnone speculatable
declare i32 @llvm.amdgcn.workgroup.id.x() #1

; Function Attrs: nounwind readnone speculatable
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1

attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
attributes #1 = { nounwind readnone speculatable }