; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s

; Test that loops with different maximum offsets for different address
; spaces are correctly handled.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
; OPT: br label %.lr.ph
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Constant 4095 matches the offset LSR folds into the preheader gep
  ; (checked above); presumably the largest offset usable for this
  ; addrspace(1) access on this target.
  %tmp1 = add nuw nsw i64 %indvars.iv, 4095
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4096
; OPT: br label %.lr.ph

; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; 4096 = 4095 + 1: one past the offset used by the companion
  ; @test_global_addressing_loop_uniform_index_max_offset_i32 test.
  %tmp1 = add nuw nsw i64 %indvars.iv, 4096
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
; OPT: br label %.lr.ph
; OPT: {{^}}.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Constant 65535 matches the offset LSR folds into the preheader gep
  ; (checked above); presumably the largest offset usable for this
  ; addrspace(3) access on this target. Note addrspace(3) pointers are
  ; 32-bit per the datalayout, hence the trunc before the gep.
  %tmp1 = add nuw nsw i64 %indvars.iv, 65535
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}

; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65536
; OPT: br label %.lr.ph

; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; 65536 = 65535 + 1: one past the offset used by the companion
  ; @test_local_addressing_loop_uniform_index_max_offset_i32 test.
  %tmp1 = add nuw nsw i64 %indvars.iv, 65536
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
161
162attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
163