xref: /llvm-project/llvm/test/CodeGen/SystemZ/loop-01.ll (revision 84dcf3d35b6ea8d8b6c34bc9cf21135863c47b8c)
1; Test loop tuning.
2;
3; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-block-placement | FileCheck %s
4; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -disable-block-placement \
5; RUN:  | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-Z13
6
7; Test that strength reduction is applied to addresses with a scale factor,
8; but that indexed addressing can still be used.
;
; The loop stores %a to dest[index] for index = 0..99, where dest is treated
; as an array of i32.  The directives below require that no sllg is emitted
; before the store, and that the st of %r3 uses a displacement of 400 together
; with two address registers drawn from %r1-%r5.
; NOTE(review): the 400 displacement presumably comes from the induction
; variable being rewritten to count up from -400 to 0 -- confirm against the
; generated code.
9define void @f1(ptr %dest, i32 %a) {
10; CHECK-LABEL: f1:
11; CHECK-NOT: sllg
12; CHECK: st %r3, 400({{%r[1-5],%r[1-5]}})
13; CHECK: br %r14
14entry:
15  br label %loop
16
17loop:
18  %index = phi i64 [ 0, %entry ], [ %next, %loop ]
  ; Address of element %index of an i32 array based at %dest.
19  %ptr = getelementptr i32, ptr %dest, i64 %index
20  store i32 %a, ptr %ptr
21  %next = add i64 %index, 1
  ; Leave the loop once %next reaches 100, i.e. after 100 stores.
22  %cmp = icmp ne i64 %next, 100
23  br i1 %cmp, label %loop, label %exit
24
25exit:
26  ret void
27}
28
29; Test a loop that should be converted into dbr form and then use BRCT.
;
; As written, the i32 counter starts at 0, is incremented once per iteration,
; and the loop exits when the incremented value equals 100.  The directives
; expect the generated code to load 100 into a register with lhi and to close
; the loop with a brct on that same register, branching back to the label that
; is annotated with the %loop block name.
30define void @f2(ptr %src, ptr %dest) {
31; CHECK-LABEL: f2:
32; CHECK: lhi [[REG:%r[0-5]]], 100
33; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
34; CHECK: brct [[REG]], [[LABEL]]
35; CHECK: br %r14
36entry:
37  br label %loop
38
39loop:
40  %count = phi i32 [ 0, %entry ], [ %next, %loop.next ]
41  %next = add i32 %count, 1
  ; Volatile load; the value decides whether the %loop.store block runs, so
  ; the loop body spans several basic blocks.
42  %val = load volatile i32, ptr %src
43  %cmp = icmp eq i32 %val, 0
44  br i1 %cmp, label %loop.next, label %loop.store
45
46loop.store:
  ; Only reached when the loaded value is nonzero.
47  %add = add i32 %val, 1
48  store volatile i32 %add, ptr %dest
49  br label %loop.next
50
51loop.next:
  ; Loop latch: continue until the incremented counter equals 100.
52  %cont = icmp ne i32 %next, 100
53  br i1 %cont, label %loop, label %exit
54
55exit:
56  ret void
57}
58
59; Like f2, but for BRCTG.
;
; Same loop shape as f2 with an i64 counter and i64 volatile accesses.  The
; directives expect lghi to load 100 into a register and brctg on that same
; register to branch back to the label annotated with the %loop block name.
60define void @f3(ptr %src, ptr %dest) {
61; CHECK-LABEL: f3:
62; CHECK: lghi [[REG:%r[0-5]]], 100
63; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
64; CHECK: brctg [[REG]], [[LABEL]]
65; CHECK: br %r14
66entry:
67  br label %loop
68
69loop:
70  %count = phi i64 [ 0, %entry ], [ %next, %loop.next ]
71  %next = add i64 %count, 1
  ; Volatile load; the value decides whether the %loop.store block runs.
72  %val = load volatile i64, ptr %src
73  %cmp = icmp eq i64 %val, 0
74  br i1 %cmp, label %loop.next, label %loop.store
75
76loop.store:
  ; Only reached when the loaded value is nonzero.
77  %add = add i64 %val, 1
78  store volatile i64 %add, ptr %dest
79  br label %loop.next
80
81loop.next:
  ; Loop latch: continue until the incremented counter equals 100.
82  %cont = icmp ne i64 %next, 100
83  br i1 %cont, label %loop, label %exit
84
85exit:
86  ret void
87}
88
89; Test a loop with a 64-bit decremented counter in which the 32-bit
90; low part of the counter is used after the decrement.  This is an example
91; of a subregister use being the only thing that blocks a conversion to BRCTG.
;
; The counter %left starts at %count and is decremented by 1 in %loop.next.
; The low 32 bits of the decremented value are merged with %val (shifted into
; the high 32 bits) and stored to %dest2.  The directives expect an explicit
; aghi of -1 on the counter register, an lr copying it into a second register,
; an stg of that second register, and a jne closing the loop.
92define void @f4(ptr %src, ptr %dest, ptr %dest2, i64 %count) {
93; CHECK-LABEL: f4:
94; CHECK: aghi [[REG:%r[0-5]]], -1
95; CHECK: lr [[REG2:%r[0-5]]], [[REG]]
96; CHECK: stg [[REG2]],
97; CHECK: jne {{\..*}}
98; CHECK: br %r14
99entry:
100  br label %loop
101
102loop:
103  %left = phi i64 [ %count, %entry ], [ %next, %loop.next ]
  ; The full 64-bit counter is stored before it is decremented.
104  store volatile i64 %left, ptr %dest2
105  %val = load volatile i32, ptr %src
106  %cmp = icmp eq i32 %val, 0
107  br i1 %cmp, label %loop.next, label %loop.store
108
109loop.store:
  ; Only reached when the loaded value is nonzero.
110  %add = add i32 %val, 1
111  store volatile i32 %add, ptr %dest
112  br label %loop.next
113
114loop.next:
115  %next = add i64 %left, -1
  ; Build (%val << 32) | (low 32 bits of %next); this is the use of the low
  ; part of the decremented counter that the header comment refers to.
116  %ext = zext i32 %val to i64
117  %shl = shl i64 %ext, 32
118  %and = and i64 %next, 4294967295
119  %or = or i64 %shl, %and
120  store volatile i64 %or, ptr %dest2
  ; Loop latch: continue until the decremented counter reaches 0.
121  %cont = icmp ne i64 %next, 0
122  br i1 %cont, label %loop, label %exit
123
124exit:
125  ret void
126}
127
128; Test that negative offsets are avoided for loads of floating point.
;
; Each iteration adds the three float fields of b[i] and stores the sum to the
; first field of a[i].  The function returns immediately when %S is 0;
; otherwise the loop ends when the 32-bit truncation of i+1 equals %S.
; Only the z13 run verifies this function (the directives use the z13-only
; prefix): it rejects any text of the form -<digits>(%r after the f5 label.
129%s.float = type { float, float, float }
130define void @f5(ptr nocapture %a,
131                ptr nocapture readonly %b,
132                i32 zeroext %S) {
133; CHECK-Z13-LABEL: f5:
134; CHECK-Z13-NOT: -{{[0-9]+}}(%r
135
136entry:
  ; Skip the loop entirely for a zero trip count.
137  %cmp9 = icmp eq i32 %S, 0
138  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
139
140for.body.preheader:                 ; preds = %entry
141  br label %for.body
142
143for.cond.cleanup.loopexit:          ; preds = %for.body
144  br label %for.cond.cleanup
145
146for.cond.cleanup:                   ; preds = %for.cond.cleanup.loopexit, %entry
147  ret void
148
149for.body:                           ; preds = %for.body.preheader, %for.body
150  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of b[%indvars.iv] and sum them.
151  %a1 = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 0
152  %tmp = load float, ptr %a1, align 4
153  %b4 = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 1
154  %tmp1 = load float, ptr %b4, align 4
155  %add = fadd float %tmp, %tmp1
156  %c = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 2
157  %tmp2 = load float, ptr %c, align 4
158  %add7 = fadd float %add, %tmp2
  ; Store the sum to field 0 of a[%indvars.iv].
159  %a10 = getelementptr inbounds %s.float, ptr %a, i64 %indvars.iv, i32 0
160  store float %add7, ptr %a10, align 4
161  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the incremented index with %S.
162  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
163  %exitcond = icmp eq i32 %lftr.wideiv, %S
164  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
165}
166
167; Test that negative offsets are avoided for loads of double.
;
; Same loop as f5, but over a struct of three doubles.  Note that the loads
; and the store are all marked "align 4" in this test.
; Only the z13 run verifies this function: it rejects any text of the form
; -<digits>(%r after the f6 label.
168%s.double = type { double, double, double }
169define void @f6(ptr nocapture %a,
170                ptr nocapture readonly %b,
171                i32 zeroext %S) {
172; CHECK-Z13-LABEL: f6:
173; CHECK-Z13-NOT: -{{[0-9]+}}(%r
174entry:
  ; Skip the loop entirely for a zero trip count.
175  %cmp9 = icmp eq i32 %S, 0
176  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
177
178for.body.preheader:                  ; preds = %entry
179  br label %for.body
180
181for.cond.cleanup.loopexit:           ; preds = %for.body
182  br label %for.cond.cleanup
183
184for.cond.cleanup:                    ; preds = %for.cond.cleanup.loopexit, %entry
185  ret void
186
187for.body:                            ; preds = %for.body.preheader, %for.body
188  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of b[%indvars.iv] and sum them.
189  %a1 = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 0
190  %tmp = load double, ptr %a1, align 4
191  %b4 = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 1
192  %tmp1 = load double, ptr %b4, align 4
193  %add = fadd double %tmp, %tmp1
194  %c = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 2
195  %tmp2 = load double, ptr %c, align 4
196  %add7 = fadd double %add, %tmp2
  ; Store the sum to field 0 of a[%indvars.iv].
197  %a10 = getelementptr inbounds %s.double, ptr %a, i64 %indvars.iv, i32 0
198  store double %add7, ptr %a10, align 4
199  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the incremented index with %S.
200  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
201  %exitcond = icmp eq i32 %lftr.wideiv, %S
202  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
203}
204
205; Test that negative offsets are avoided for memory accesses of vector type.
;
; Same loop as f5, but over a struct of three <4 x i32> vectors, using integer
; vector adds instead of fadd.
; Only the z13 run verifies this function: it rejects any text of the form
; -<digits>(%r after the f7 label.
206%s.vec = type { <4 x i32>, <4 x i32>, <4 x i32> }
207define void @f7(ptr nocapture %a,
208                ptr nocapture readonly %b,
209                i32 zeroext %S) {
210; CHECK-Z13-LABEL: f7:
211; CHECK-Z13-NOT: -{{[0-9]+}}(%r
212entry:
  ; Skip the loop entirely for a zero trip count.
213  %cmp9 = icmp eq i32 %S, 0
214  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
215
216for.body.preheader:                 ; preds = %entry
217  br label %for.body
218
219for.cond.cleanup.loopexit:          ; preds = %for.body
220  br label %for.cond.cleanup
221
222for.cond.cleanup:                   ; preds = %for.cond.cleanup.loopexit, %entry
223  ret void
224
225for.body:                           ; preds = %for.body.preheader, %for.body
226  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of b[%indvars.iv] and sum them element-wise.
227  %a1 = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 0
228  %tmp = load <4 x i32>, ptr %a1, align 4
229  %b4 = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 1
230  %tmp1 = load <4 x i32>, ptr %b4, align 4
231  %add = add <4 x i32> %tmp1, %tmp
232  %c = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 2
233  %tmp2 = load <4 x i32>, ptr %c, align 4
234  %add7 = add <4 x i32> %add, %tmp2
  ; Store the sum to field 0 of a[%indvars.iv].
235  %a10 = getelementptr inbounds %s.vec, ptr %a, i64 %indvars.iv, i32 0
236  store <4 x i32> %add7, ptr %a10, align 4
237  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the incremented index with %S.
238  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
239  %exitcond = icmp eq i32 %lftr.wideiv, %S
240  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
241}
242
243; Test that a memcpy loop does not get a lot of lays before each mvc (D12 and no index-reg).
;
; The loop in %bb5 never exits (it branches back to itself unconditionally).
; Each iteration issues four 24-byte memcpy calls, copying the elements at
; offsets -1, -2, -3 and -4 (in units of %0) from the source pointer to the
; destination pointer, and then moves both pointers back by four elements.
; The z13-only directives require four mvc instructions on consecutive lines.
; NOTE(review): the undef operands suggest this is a reduced test case; of the
; types below only %0 (and %1 through it) is referenced by the code visible in
; this function.
244%0 = type { %1, ptr }
245%1 = type { ptr, ptr }
246%2 = type <{ %3, i32, [4 x i8] }>
247%3 = type { ptr, ptr, ptr }
248
249declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #0
250
251define void @f8() {
252; CHECK-Z13-LABEL: f8:
253; CHECK-Z13: mvc
254; CHECK-Z13-NEXT: mvc
255; CHECK-Z13-NEXT: mvc
256; CHECK-Z13-NEXT: mvc
257
258bb:
259  %tmp = load ptr, ptr undef, align 8
260  br i1 undef, label %bb2, label %bb1
261
262bb1:                                              ; preds = %bb
263  br label %bb2
264
265bb2:                                              ; preds = %bb1, %bb
  ; Initial destination (%tmp3) and source (%tmp4) pointers for the loop.
266  %tmp3 = phi ptr [ %tmp, %bb ], [ undef, %bb1 ]
267  %tmp4 = phi ptr [ undef, %bb ], [ undef, %bb1 ]
268  br label %bb5
269
270bb5:                                              ; preds = %bb5, %bb2
  ; %tmp6 is the destination pointer and %tmp7 the source pointer; each is
  ; replaced by its "-4 elements" address on the back edge.
271  %tmp6 = phi ptr [ %tmp21, %bb5 ], [ %tmp3, %bb2 ]
272  %tmp7 = phi ptr [ %tmp20, %bb5 ], [ %tmp4, %bb2 ]
273  %tmp8 = getelementptr inbounds %0, ptr %tmp7, i64 -1
274  %tmp9 = getelementptr inbounds %0, ptr %tmp6, i64 -1
275  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp9, ptr align 8 %tmp8, i64 24, i1 false)
276  %tmp12 = getelementptr inbounds %0, ptr %tmp7, i64 -2
277  %tmp13 = getelementptr inbounds %0, ptr %tmp6, i64 -2
278  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp13, ptr align 8 %tmp12, i64 24, i1 false)
279  %tmp16 = getelementptr inbounds %0, ptr %tmp7, i64 -3
280  %tmp17 = getelementptr inbounds %0, ptr %tmp6, i64 -3
281  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp17, ptr align 8 %tmp16, i64 24, i1 false)
282  %tmp20 = getelementptr inbounds %0, ptr %tmp7, i64 -4
283  %tmp21 = getelementptr inbounds %0, ptr %tmp6, i64 -4
284  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp21, ptr align 8 %tmp20, i64 24, i1 false)
285  br label %bb5
286}
287
288; Test that a chsi does not need an aghik inside the loop (no index reg).
;
; The loop loads an i32 at index %indvars.iv155.i from an undef base pointer
; and compares it with 0; a nonzero value leads to an unreachable block.
; Otherwise a second i32 is loaded at index (iv | 1) and the index advances
; by 4.  The loop has no exit edge other than the unreachable block.
; The z13-only directives require that, after the inner-loop header comment
; in the output, a chsi appears with no aghik before it.
289define void @f9() {
290; CHECK-Z13-LABEL: f9:
291; CHECK-Z13: # =>This Inner Loop Header: Depth=1
292; CHECK-Z13-NOT: aghik
293; CHECK-Z13: chsi
294
295entry:
296  br label %for.body.i63
297
298for.body.i63:                                     ; preds = %for.inc.i, %entry
299  %indvars.iv155.i = phi i64 [ 0, %entry ], [ %indvars.iv.next156.i.3, %for.inc.i ]
300  %arrayidx.i62 = getelementptr inbounds i32, ptr undef, i64 %indvars.iv155.i
301  %tmp = load i32, ptr %arrayidx.i62, align 4
  ; Compare of the loaded value against 0.
302  %cmp9.i = icmp eq i32 %tmp, 0
303  br i1 %cmp9.i, label %for.inc.i, label %if.then10.i
304
305if.then10.i:                                      ; preds = %for.body.i63
306  unreachable
307
308for.inc.i:                                        ; preds = %for.body.i63
309  %indvars.iv.next156.i = or i64 %indvars.iv155.i, 1
310  %arrayidx.i62.1 = getelementptr inbounds i32, ptr undef, i64 %indvars.iv.next156.i
  ; The result of this second load is not used anywhere in the function.
311  %tmp1 = load i32, ptr %arrayidx.i62.1, align 4
312  %indvars.iv.next156.i.3 = add nsw i64 %indvars.iv155.i, 4
313  br label %for.body.i63
314}
315
316; Test that offsets are in range for i128 memory accesses.
;
; The loop runs for %IV = 0, 1, 2 and stores the same i128 constant to
; elements [%IV][6][6] and [%IV][6][8] of a stack-allocated
; [3 x [7 x [10 x i128]]] array.  The exit block is unreachable.
; The z13-only directives require that no lay appears after the inner-loop
; header comment in the output.
317define void @fun10() {
318; CHECK-Z13-LABEL: fun10:
319; CHECK-Z13: # =>This Inner Loop Header: Depth=1
320; CHECK-Z13-NOT: lay
321entry:
322  %A1 = alloca [3 x [7 x [10 x i128]]], align 8
323  br label %for.body
324
325for.body:                        ; preds = %for.body, %entry
326  %IV = phi i64 [ 0, %entry ], [ %IV.next, %for.body ]
  ; Two stores into the same row (index 6), at columns 6 and 8.
327  %Addr1 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 6
328  store i128 17174966165894859678, ptr %Addr1, align 8
329  %Addr2 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 8
330  store i128 17174966165894859678, ptr %Addr2, align 8
331  %IV.next = add nuw nsw i64 %IV, 1
  ; Leave the loop once the incremented index equals 3.
332  %exitcond.not.i.i = icmp eq i64 %IV.next, 3
333  br i1 %exitcond.not.i.i, label %exit, label %for.body
334
335exit:                        ; preds = %for.body
336  unreachable
337}
338