; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s

; Test that loops with different maximum offsets for different address
; spaces are correctly handled.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; 4095 is the largest power-of-two-minus-one offset that fits the global
; (addrspace 1) addressing mode, so LSR should fold it into the pointer IV.
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
; OPT: br label %.lr.ph
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i64 %indvars.iv, 4095
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; 4096 is one past the max global offset; the folded preheader GEP still uses
; the full constant, exercising the offset-limit boundary in LSR.
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4096
; OPT: br label %.lr.ph

; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i64 %indvars.iv, 4096
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; 65535 is the maximum offset for local (addrspace 3) memory, which has a
; 32-bit pointer per the datalayout (p3:32:32).
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
; OPT: br label %.lr.ph

; OPT: {{^}}.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i64 %indvars.iv, 65535
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; 65536 is one past the max local offset; boundary check for the 32-bit
; local addressing mode.
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65536
; OPT: br label %.lr.ph

; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %tmp1 = add nuw nsw i64 %indvars.iv, 65536
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }