xref: /llvm-project/llvm/test/CodeGen/PowerPC/common-chain.ll (revision fc157522c5680b0ff982442bc8043c1e8c998161)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
3; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
4
5; addresses:
6; 1: base1 + offset
7; 2: + offset
8; 3: + offset
9; 4: + offset
10;
11; chains:
12; 1: base: base1 + offset, offsets: (0, offset)
13; 2: base: base1 + 3*offset, offsets: (0, offset)
14;
15; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
16;   long long o1 = base1 + offset;
17;   long long o2 = base1 + 2 * offset;
18;   long long o3 = base1 + 3 * offset;
19;   long long o4 = base1 + 4 * offset;
20;   char *p1 = p + o1;
21;   char *p2 = p + o2;
22;   char *p3 = p + o3;
23;   char *p4 = p + o4;
24;   long long sum = 0;
25;   for (long long i = 0; i < n; ++i) {
26;     unsigned long x1 = *(unsigned long *)(p1 + i);
27;     unsigned long x2 = *(unsigned long *)(p2 + i);
28;     unsigned long x3 = *(unsigned long *)(p3 + i);
29;     unsigned long x4 = *(unsigned long *)(p4 + i);
30;     sum += x1 * x2 * x3 * x4;
31;   }
32;   return sum;
33; }
34;
35define i64 @two_chain_same_offset_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
36; CHECK-LABEL: two_chain_same_offset_succ:
37; CHECK:       # %bb.0: # %entry
38; CHECK-NEXT:    cmpdi r6, 0
39; CHECK-NEXT:    ble cr0, .LBB0_4
40; CHECK-NEXT:  # %bb.1: # %for.body.preheader
41; CHECK-NEXT:    sldi r7, r4, 1
42; CHECK-NEXT:    mtctr r6
43; CHECK-NEXT:    add r8, r4, r7
44; CHECK-NEXT:    add r7, r5, r4
45; CHECK-NEXT:    add r5, r5, r8
46; CHECK-NEXT:    add r7, r3, r7
47; CHECK-NEXT:    add r5, r3, r5
48; CHECK-NEXT:    li r3, 0
49; CHECK-NEXT:    .p2align 4
50; CHECK-NEXT:  .LBB0_2: # %for.body
51; CHECK-NEXT:    #
52; CHECK-NEXT:    ld r6, 0(r7)
53; CHECK-NEXT:    ldx r8, r7, r4
54; CHECK-NEXT:    ld r9, 0(r5)
55; CHECK-NEXT:    ldx r10, r5, r4
56; CHECK-NEXT:    addi r7, r7, 1
57; CHECK-NEXT:    addi r5, r5, 1
58; CHECK-NEXT:    mulld r6, r8, r6
59; CHECK-NEXT:    mulld r6, r6, r9
60; CHECK-NEXT:    maddld r3, r6, r10, r3
61; CHECK-NEXT:    bdnz .LBB0_2
62; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
63; CHECK-NEXT:    blr
64; CHECK-NEXT:  .LBB0_4:
65; CHECK-NEXT:    li r3, 0
66; CHECK-NEXT:    blr
; NOTE(review): positive test — the asm above shows chain commoning succeeding:
; the preheader materializes two chain bases (r7 = p+base1+offset, r5 =
; p+base1+3*offset) and the loop covers all four loads with just ld 0(base)
; plus ldx base,r4, sharing the single offset register r4 between both chains.
67entry:
68  %mul = shl nsw i64 %offset, 1
69  %mul2 = mul nsw i64 %offset, 3
70  %mul4 = shl nsw i64 %offset, 2
71  %cmp46 = icmp sgt i64 %n, 0
72  br i1 %cmp46, label %for.body, label %for.cond.cleanup
73
74for.cond.cleanup:                                 ; preds = %for.body, %entry
75  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
76  ret i64 %sum.0.lcssa
77
78for.body:                                         ; preds = %entry, %for.body
79  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
80  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
81  %add = add i64 %i.047, %base1
82  %add.ptr9.idx = add i64 %add, %offset
83  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
84  %0 = load i64, ptr %add.ptr9, align 8
85  %add.ptr10.idx = add i64 %add, %mul
86  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
87  %1 = load i64, ptr %add.ptr10, align 8
88  %add.ptr11.idx = add i64 %add, %mul2
89  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
90  %2 = load i64, ptr %add.ptr11, align 8
91  %add.ptr12.idx = add i64 %add, %mul4
92  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
93  %3 = load i64, ptr %add.ptr12, align 8
94  %mul13 = mul i64 %1, %0
95  %mul14 = mul i64 %mul13, %2
96  %mul15 = mul i64 %mul14, %3
97  %add16 = add i64 %mul15, %sum.048
98  %inc = add nuw nsw i64 %i.047, 1
99  %exitcond.not = icmp eq i64 %inc, %n
100  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
101}
102
103; addresses:
104; 1: base1 + offset
105; 2: + offset
106; 3: + offset
107; 4: + offset
108; 5: + offset
109;
110; These addresses cannot be commoned into chains because one address would be left over, needing a chain of its own.
111; It is not profitable to common chains if not all addresses are in chains.
112;
113; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
114;   long long o1 = base1 + offset;
115;   long long o2 = base1 + 2 * offset;
116;   long long o3 = base1 + 3 * offset;
117;   long long o4 = base1 + 4 * offset;
118;   long long o5 = base1 + 5 * offset;
119;   char *p1 = p + o1;
120;   char *p2 = p + o2;
121;   char *p3 = p + o3;
122;   char *p4 = p + o4;
123;   char *p5 = p + o5;
124;   long long sum = 0;
125;   for (long long i = 0; i < n; ++i) {
126;     unsigned long x1 = *(unsigned long *)(p1 + i);
127;     unsigned long x2 = *(unsigned long *)(p2 + i);
128;     unsigned long x3 = *(unsigned long *)(p3 + i);
129;     unsigned long x4 = *(unsigned long *)(p4 + i);
130;     unsigned long x5 = *(unsigned long *)(p5 + i);
131;     sum += x1 * x2 * x3 * x4 * x5;
132;   }
133;   return sum;
134; }
135;
136define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
137; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
138; CHECK:       # %bb.0: # %entry
139; CHECK-NEXT:    cmpdi r6, 0
140; CHECK-NEXT:    ble cr0, .LBB1_4
141; CHECK-NEXT:  # %bb.1: # %for.body.preheader
142; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
143; CHECK-NEXT:    sldi r7, r4, 1
144; CHECK-NEXT:    add r5, r3, r5
145; CHECK-NEXT:    li r3, 0
146; CHECK-NEXT:    add r8, r4, r7
147; CHECK-NEXT:    sldi r9, r4, 2
148; CHECK-NEXT:    mtctr r6
149; CHECK-NEXT:    add r10, r4, r9
150; CHECK-NEXT:    .p2align 4
151; CHECK-NEXT:  .LBB1_2: # %for.body
152; CHECK-NEXT:    #
153; CHECK-NEXT:    ldx r6, r5, r4
154; CHECK-NEXT:    ldx r11, r5, r7
155; CHECK-NEXT:    ldx r12, r5, r8
156; CHECK-NEXT:    ldx r0, r5, r9
157; CHECK-NEXT:    mulld r6, r11, r6
158; CHECK-NEXT:    ldx r30, r5, r10
159; CHECK-NEXT:    addi r5, r5, 1
160; CHECK-NEXT:    mulld r6, r6, r12
161; CHECK-NEXT:    mulld r6, r6, r0
162; CHECK-NEXT:    maddld r3, r6, r30, r3
163; CHECK-NEXT:    bdnz .LBB1_2
164; CHECK-NEXT:  # %bb.3:
165; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
166; CHECK-NEXT:    blr
167; CHECK-NEXT:  .LBB1_4:
168; CHECK-NEXT:    li r3, 0
169; CHECK-NEXT:    blr
; NOTE(review): negative test — five addresses cannot be split into even-sized
; chains, so no commoning happens: all five loads are ldx off the single base
; r5 with five distinct offset registers (r4, r7, r8, r9, r10), and the extra
; register pressure forces the r30 spill/reload visible above.
170entry:
171  %mul = shl nsw i64 %offset, 1
172  %mul2 = mul nsw i64 %offset, 3
173  %mul4 = shl nsw i64 %offset, 2
174  %mul6 = mul nsw i64 %offset, 5
175  %cmp58 = icmp sgt i64 %n, 0
176  br i1 %cmp58, label %for.body, label %for.cond.cleanup
177
178for.cond.cleanup:                                 ; preds = %for.body, %entry
179  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
180  ret i64 %sum.0.lcssa
181
182for.body:                                         ; preds = %entry, %for.body
183  %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
184  %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
185  %add = add i64 %i.059, %base1
186  %add.ptr12.idx = add i64 %add, %offset
187  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
188  %0 = load i64, ptr %add.ptr12, align 8
189  %add.ptr13.idx = add i64 %add, %mul
190  %add.ptr13 = getelementptr inbounds i8, ptr %p, i64 %add.ptr13.idx
191  %1 = load i64, ptr %add.ptr13, align 8
192  %add.ptr14.idx = add i64 %add, %mul2
193  %add.ptr14 = getelementptr inbounds i8, ptr %p, i64 %add.ptr14.idx
194  %2 = load i64, ptr %add.ptr14, align 8
195  %add.ptr15.idx = add i64 %add, %mul4
196  %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
197  %3 = load i64, ptr %add.ptr15, align 8
198  %add.ptr16.idx = add i64 %add, %mul6
199  %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
200  %4 = load i64, ptr %add.ptr16, align 8
201  %mul17 = mul i64 %1, %0
202  %mul18 = mul i64 %mul17, %2
203  %mul19 = mul i64 %mul18, %3
204  %mul20 = mul i64 %mul19, %4
205  %add21 = add i64 %mul20, %sum.060
206  %inc = add nuw nsw i64 %i.059, 1
207  %exitcond.not = icmp eq i64 %inc, %n
208  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
209}
210
211; addresses:
212; 1: base1
213; 2: + 2*offset
214; 3: + offset
215;
216; We need at least 4 addresses to common 2 chains to reuse at least 1 offset.
217;
218; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
219;   long long o1 = base1;
220;   long long o2 = base1 + 2 * offset;
221;   long long o3 = base1 + 3 * offset;
222;   char *p1 = p + o1;
223;   char *p2 = p + o2;
224;   char *p3 = p + o3;
225;   long long sum = 0;
226;   for (long long i = 0; i < n; ++i) {
227;     unsigned long x1 = *(unsigned long *)(p1 + i);
228;     unsigned long x2 = *(unsigned long *)(p2 + i);
229;     unsigned long x3 = *(unsigned long *)(p3 + i);
230;     sum += x1 * x2 * x3;
231;   }
232;   return sum;
233; }
234;
235define i64 @no_enough_elements_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
236; CHECK-LABEL: no_enough_elements_fail:
237; CHECK:       # %bb.0: # %entry
238; CHECK-NEXT:    cmpdi r6, 0
239; CHECK-NEXT:    ble cr0, .LBB2_4
240; CHECK-NEXT:  # %bb.1: # %for.body.preheader
241; CHECK-NEXT:    sldi r7, r4, 1
242; CHECK-NEXT:    mtctr r6
243; CHECK-NEXT:    add r5, r3, r5
244; CHECK-NEXT:    li r3, 0
245; CHECK-NEXT:    add r4, r4, r7
246; CHECK-NEXT:    .p2align 5
247; CHECK-NEXT:  .LBB2_2: # %for.body
248; CHECK-NEXT:    #
249; CHECK-NEXT:    ld r6, 0(r5)
250; CHECK-NEXT:    ldx r8, r5, r7
251; CHECK-NEXT:    ldx r9, r5, r4
252; CHECK-NEXT:    addi r5, r5, 1
253; CHECK-NEXT:    mulld r6, r8, r6
254; CHECK-NEXT:    maddld r3, r6, r9, r3
255; CHECK-NEXT:    bdnz .LBB2_2
256; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
257; CHECK-NEXT:    blr
; NOTE(review): negative test — with only three addresses no chains are
; formed: the loop keeps a single base r5 and two independent offset
; registers (r7 = 2*offset, r4 = 3*offset) for the ldx loads.
258; CHECK-NEXT:  .LBB2_4:
259; CHECK-NEXT:    li r3, 0
260; CHECK-NEXT:    blr
261entry:
262  %mul = shl nsw i64 %offset, 1
263  %mul1 = mul nsw i64 %offset, 3
264  %cmp32 = icmp sgt i64 %n, 0
265  br i1 %cmp32, label %for.body, label %for.cond.cleanup
266
267for.cond.cleanup:                                 ; preds = %for.body, %entry
268  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
269  ret i64 %sum.0.lcssa
270
271for.body:                                         ; preds = %entry, %for.body
272  %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
273  %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
274  %add.ptr5.idx = add i64 %i.033, %base1
275  %add.ptr5 = getelementptr inbounds i8, ptr %p, i64 %add.ptr5.idx
276  %0 = load i64, ptr %add.ptr5, align 8
277  %add.ptr6.idx = add i64 %add.ptr5.idx, %mul
278  %add.ptr6 = getelementptr inbounds i8, ptr %p, i64 %add.ptr6.idx
279  %1 = load i64, ptr %add.ptr6, align 8
280  %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
281  %add.ptr7 = getelementptr inbounds i8, ptr %p, i64 %add.ptr7.idx
282  %2 = load i64, ptr %add.ptr7, align 8
283  %mul8 = mul i64 %1, %0
284  %mul9 = mul i64 %mul8, %2
285  %add10 = add i64 %mul9, %sum.034
286  %inc = add nuw nsw i64 %i.033, 1
287  %exitcond.not = icmp eq i64 %inc, %n
288  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
289}
290
291; addresses:
292; 1: base1
293; 2: + 2*offset
294; 3: + 2*offset
295; 4: + 3*offset
296;
297; The diff between address 2 and address 1 is 2*offset, and this offset is not reused among other chains,
298; so we cannot common any chains.
299;
300; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
301;   long long o1 = base1;
302;   long long o2 = base1 + 2 * offset;
303;   long long o3 = base1 + 4 * offset;
304;   long long o4 = base1 + 7 * offset;
305;   char *p1 = p + o1;
306;   char *p2 = p + o2;
307;   char *p3 = p + o3;
308;   char *p4 = p + o4;
309;   long long sum = 0;
310;   for (long long i = 0; i < n; ++i) {
311;     unsigned long x1 = *(unsigned long *)(p1 + i);
312;     unsigned long x2 = *(unsigned long *)(p2 + i);
313;     unsigned long x3 = *(unsigned long *)(p3 + i);
314;     unsigned long x4 = *(unsigned long *)(p4 + i);
315;     sum += x1 * x2 * x3 * x4;
316;   }
317;   return sum;
318; }
319;
320define i64 @no_reuseable_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
321; CHECK-LABEL: no_reuseable_offset_fail:
322; CHECK:       # %bb.0: # %entry
323; CHECK-NEXT:    cmpdi r6, 0
324; CHECK-NEXT:    ble cr0, .LBB3_4
325; CHECK-NEXT:  # %bb.1: # %for.body.preheader
326; CHECK-NEXT:    sldi r9, r4, 3
327; CHECK-NEXT:    mtctr r6
328; CHECK-NEXT:    add r5, r3, r5
329; CHECK-NEXT:    li r3, 0
330; CHECK-NEXT:    sldi r7, r4, 1
331; CHECK-NEXT:    sldi r8, r4, 2
332; CHECK-NEXT:    sub r4, r9, r4
333; CHECK-NEXT:    .p2align 4
334; CHECK-NEXT:  .LBB3_2: # %for.body
335; CHECK-NEXT:    #
336; CHECK-NEXT:    ld r6, 0(r5)
337; CHECK-NEXT:    ldx r9, r5, r7
338; CHECK-NEXT:    ldx r10, r5, r8
339; CHECK-NEXT:    ldx r11, r5, r4
340; CHECK-NEXT:    addi r5, r5, 1
341; CHECK-NEXT:    mulld r6, r9, r6
342; CHECK-NEXT:    mulld r6, r6, r10
343; CHECK-NEXT:    maddld r3, r6, r11, r3
344; CHECK-NEXT:    bdnz .LBB3_2
345; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
346; CHECK-NEXT:    blr
; NOTE(review): negative test — the address deltas (2, 2, 3 times offset) give
; no reusable inter-chain offset, so no commoning: the loop keeps one base r5
; and three separate offset registers (r7 = 2*offset, r8 = 4*offset,
; r4 = 7*offset computed via sldi/sub in the preheader).
347; CHECK-NEXT:  .LBB3_4:
348; CHECK-NEXT:    li r3, 0
349; CHECK-NEXT:    blr
350entry:
351  %mul = shl nsw i64 %offset, 1
352  %mul1 = shl nsw i64 %offset, 2
353  %mul3 = mul nsw i64 %offset, 7
354  %cmp44 = icmp sgt i64 %n, 0
355  br i1 %cmp44, label %for.body, label %for.cond.cleanup
356
357for.cond.cleanup:                                 ; preds = %for.body, %entry
358  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
359  ret i64 %sum.0.lcssa
360
361for.body:                                         ; preds = %entry, %for.body
362  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
363  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
364  %add.ptr8.idx = add i64 %i.045, %base1
365  %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
366  %0 = load i64, ptr %add.ptr8, align 8
367  %add.ptr9.idx = add i64 %add.ptr8.idx, %mul
368  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
369  %1 = load i64, ptr %add.ptr9, align 8
370  %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
371  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
372  %2 = load i64, ptr %add.ptr10, align 8
373  %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
374  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
375  %3 = load i64, ptr %add.ptr11, align 8
376  %mul12 = mul i64 %1, %0
377  %mul13 = mul i64 %mul12, %2
378  %mul14 = mul i64 %mul13, %3
379  %add15 = add i64 %mul14, %sum.046
380  %inc = add nuw nsw i64 %i.045, 1
381  %exitcond.not = icmp eq i64 %inc, %n
382  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
383}
384
385; addresses:
386; 1: base1 + offset
387; 2: + offset
388; 3: + 3*offset
389; 4: + 2*offset
390; 5: + 1*offset
391; 6: + 2*offset
392;
393; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
394; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6
395; and address 5 (2*offset), so we cannot common chains for these addresses.
396;
397; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
398;   long long o1 = base1 + offset;
399;   long long o2 = base1 + 2 * offset;
400;   long long o3 = base1 + 5 * offset;
401;   long long o4 = base1 + 7 * offset;
402;   long long o5 = base1 + 8 * offset;
403;   long long o6 = base1 + 10 * offset;
404;   char *p1 = p + o1;
405;   char *p2 = p + o2;
406;   char *p3 = p + o3;
407;   char *p4 = p + o4;
408;   char *p5 = p + o5;
409;   char *p6 = p + o6;
410;   long long sum = 0;
411;   for (long long i = 0; i < n; ++i) {
412;     unsigned long x1 = *(unsigned long *)(p1 + i);
413;     unsigned long x2 = *(unsigned long *)(p2 + i);
414;     unsigned long x3 = *(unsigned long *)(p3 + i);
415;     unsigned long x4 = *(unsigned long *)(p4 + i);
416;     unsigned long x5 = *(unsigned long *)(p5 + i);
417;     unsigned long x6 = *(unsigned long *)(p6 + i);
418;     sum += x1 * x2 * x3 * x4 * x5 * x6;
419;   }
420;   return sum;
421; }
422;
423define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
424; CHECK-LABEL: not_same_offset_fail:
425; CHECK:       # %bb.0: # %entry
426; CHECK-NEXT:    cmpdi r6, 0
427; CHECK-NEXT:    ble cr0, .LBB4_4
428; CHECK-NEXT:  # %bb.1: # %for.body.preheader
429; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
430; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
431; CHECK-NEXT:    add r5, r3, r5
432; CHECK-NEXT:    li r3, 0
433; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
434; CHECK-NEXT:    mtctr r6
435; CHECK-NEXT:    mulli r11, r4, 10
436; CHECK-NEXT:    sldi r8, r4, 2
437; CHECK-NEXT:    add r8, r4, r8
438; CHECK-NEXT:    sldi r9, r4, 3
439; CHECK-NEXT:    sub r10, r9, r4
440; CHECK-NEXT:    sldi r7, r4, 1
441; CHECK-NEXT:    .p2align 4
442; CHECK-NEXT:  .LBB4_2: # %for.body
443; CHECK-NEXT:    #
444; CHECK-NEXT:    ldx r6, r5, r4
445; CHECK-NEXT:    ldx r12, r5, r7
446; CHECK-NEXT:    ldx r0, r5, r8
447; CHECK-NEXT:    ldx r30, r5, r10
448; CHECK-NEXT:    mulld r6, r12, r6
449; CHECK-NEXT:    ldx r29, r5, r9
450; CHECK-NEXT:    ldx r28, r5, r11
451; CHECK-NEXT:    addi r5, r5, 1
452; CHECK-NEXT:    mulld r6, r6, r0
453; CHECK-NEXT:    mulld r6, r6, r30
454; CHECK-NEXT:    mulld r6, r6, r29
455; CHECK-NEXT:    maddld r3, r6, r28, r3
456; CHECK-NEXT:    bdnz .LBB4_2
457; CHECK-NEXT:  # %bb.3:
458; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
459; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
460; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
461; CHECK-NEXT:    blr
; NOTE(review): negative test — candidate chains would have mismatched inner
; offsets, so no commoning: all six loads are ldx off the single base r5 with
; six distinct offset registers (r4, r7, r8, r10, r9, r11), costing the
; r28-r30 spills/reloads checked above.
462; CHECK-NEXT:  .LBB4_4:
463; CHECK-NEXT:    li r3, 0
464; CHECK-NEXT:    blr
465entry:
466  %mul = shl nsw i64 %offset, 1
467  %mul2 = mul nsw i64 %offset, 5
468  %mul4 = mul nsw i64 %offset, 7
469  %mul6 = shl nsw i64 %offset, 3
470  %mul8 = mul nsw i64 %offset, 10
471  %cmp70 = icmp sgt i64 %n, 0
472  br i1 %cmp70, label %for.body, label %for.cond.cleanup
473
474for.cond.cleanup:                                 ; preds = %for.body, %entry
475  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
476  ret i64 %sum.0.lcssa
477
478for.body:                                         ; preds = %entry, %for.body
479  %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
480  %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
481  %add = add i64 %i.071, %base1
482  %add.ptr15.idx = add i64 %add, %offset
483  %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
484  %0 = load i64, ptr %add.ptr15, align 8
485  %add.ptr16.idx = add i64 %add, %mul
486  %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
487  %1 = load i64, ptr %add.ptr16, align 8
488  %add.ptr17.idx = add i64 %add, %mul2
489  %add.ptr17 = getelementptr inbounds i8, ptr %p, i64 %add.ptr17.idx
490  %2 = load i64, ptr %add.ptr17, align 8
491  %add.ptr18.idx = add i64 %add, %mul4
492  %add.ptr18 = getelementptr inbounds i8, ptr %p, i64 %add.ptr18.idx
493  %3 = load i64, ptr %add.ptr18, align 8
494  %add.ptr19.idx = add i64 %add, %mul6
495  %add.ptr19 = getelementptr inbounds i8, ptr %p, i64 %add.ptr19.idx
496  %4 = load i64, ptr %add.ptr19, align 8
497  %add.ptr20.idx = add i64 %add, %mul8
498  %add.ptr20 = getelementptr inbounds i8, ptr %p, i64 %add.ptr20.idx
499  %5 = load i64, ptr %add.ptr20, align 8
500  %mul21 = mul i64 %1, %0
501  %mul22 = mul i64 %mul21, %2
502  %mul23 = mul i64 %mul22, %3
503  %mul24 = mul i64 %mul23, %4
504  %mul25 = mul i64 %mul24, %5
505  %add26 = add i64 %mul25, %sum.072
506  %inc = add nuw nsw i64 %i.071, 1
507  %exitcond.not = icmp eq i64 %inc, %n
508  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
509}
510
511; addresses:
512; 1: base1 + offset
513; 2: + offset
514; 3: + 3*offset
515; 4: + 2*offset
516;
517; chains:
518; 1: base1 + offset, offsets: (0, 2*offset)
519; 2: base1 + 4*offset, offsets: (0, 2*offset)
520;
521; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
522;   long long o1 = base1 + offset;
523;   long long o2 = base1 + 3 * offset;
524;   long long o3 = base1 + 4 * offset;
525;   long long o4 = base1 + 6 * offset;
526;   char *p1 = p + o1;
527;   char *p2 = p + o2;
528;   char *p3 = p + o3;
529;   char *p4 = p + o4;
530;   long long sum = 0;
531;   for (long long i = 0; i < n; ++i) {
532;     unsigned long x1 = *(unsigned long *)(p1 + i);
533;     unsigned long x2 = *(unsigned long *)(p2 + i);
534;     unsigned long x3 = *(unsigned long *)(p3 + i);
535;     unsigned long x4 = *(unsigned long *)(p4 + i);
536;     sum += x1 * x2 * x3 * x4;
537;   }
538;   return sum;
539; }
540;
541define i64 @two_chain_different_offsets_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
542; CHECK-LABEL: two_chain_different_offsets_succ:
543; CHECK:       # %bb.0: # %entry
544; CHECK-NEXT:    cmpdi r6, 0
545; CHECK-NEXT:    ble cr0, .LBB5_4
546; CHECK-NEXT:  # %bb.1: # %for.body.preheader
547; CHECK-NEXT:    sldi r8, r4, 2
548; CHECK-NEXT:    add r7, r5, r4
549; CHECK-NEXT:    mtctr r6
550; CHECK-NEXT:    add r5, r5, r8
551; CHECK-NEXT:    add r7, r3, r7
552; CHECK-NEXT:    sldi r4, r4, 1
553; CHECK-NEXT:    add r5, r3, r5
554; CHECK-NEXT:    li r3, 0
555; CHECK-NEXT:    .p2align 4
556; CHECK-NEXT:  .LBB5_2: # %for.body
557; CHECK-NEXT:    #
558; CHECK-NEXT:    ld r6, 0(r7)
559; CHECK-NEXT:    ldx r8, r7, r4
560; CHECK-NEXT:    ld r9, 0(r5)
561; CHECK-NEXT:    ldx r10, r5, r4
562; CHECK-NEXT:    addi r7, r7, 1
563; CHECK-NEXT:    addi r5, r5, 1
564; CHECK-NEXT:    mulld r6, r8, r6
565; CHECK-NEXT:    mulld r6, r6, r9
566; CHECK-NEXT:    maddld r3, r6, r10, r3
567; CHECK-NEXT:    bdnz .LBB5_2
568; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
569; CHECK-NEXT:    blr
; NOTE(review): positive test — chains with a shared inner stride of 2*offset:
; the preheader builds two bases (r7 = p+base1+offset, r5 = p+base1+4*offset)
; and both chains reuse the single offset register r4 (= 2*offset) via ldx.
570; CHECK-NEXT:  .LBB5_4:
571; CHECK-NEXT:    li r3, 0
572; CHECK-NEXT:    blr
573entry:
574  %mul = mul nsw i64 %offset, 3
575  %mul2 = shl nsw i64 %offset, 2
576  %mul4 = mul nsw i64 %offset, 6
577  %cmp46 = icmp sgt i64 %n, 0
578  br i1 %cmp46, label %for.body, label %for.cond.cleanup
579
580for.cond.cleanup:                                 ; preds = %for.body, %entry
581  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
582  ret i64 %sum.0.lcssa
583
584for.body:                                         ; preds = %entry, %for.body
585  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
586  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
587  %add = add i64 %i.047, %base1
588  %add.ptr9.idx = add i64 %add, %offset
589  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
590  %0 = load i64, ptr %add.ptr9, align 8
591  %add.ptr10.idx = add i64 %add, %mul
592  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
593  %1 = load i64, ptr %add.ptr10, align 8
594  %add.ptr11.idx = add i64 %add, %mul2
595  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
596  %2 = load i64, ptr %add.ptr11, align 8
597  %add.ptr12.idx = add i64 %add, %mul4
598  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
599  %3 = load i64, ptr %add.ptr12, align 8
600  %mul13 = mul i64 %1, %0
601  %mul14 = mul i64 %mul13, %2
602  %mul15 = mul i64 %mul14, %3
603  %add16 = add i64 %mul15, %sum.048
604  %inc = add nuw nsw i64 %i.047, 1
605  %exitcond.not = icmp eq i64 %inc, %n
606  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
607}
608
609; addresses:
610; 1: base1 + offset
611; 2: + 2*offset
612; 3: + base2 - base1 - 2*offset
613; 4: + 2*offset
614;
615; chains:
616; 1: base1 + offset, offsets: (0, 2*offset)
617; 2: base2 + offset, offsets: (0, 2*offset)
618;
619; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
620;   long long o1 = base1 + offset;
621;   long long o2 = base1 + 3 * offset;
622;   long long o3 = base2 + offset;
623;   long long o4 = base2 + 3 * offset;
624;   char *p1 = p + o1;
625;   char *p2 = p + o2;
626;   char *p3 = p + o3;
627;   char *p4 = p + o4;
628;   long long sum = 0;
629;   for (long long i = 0; i < n; ++i) {
630;     unsigned long x1 = *(unsigned long *)(p1 + i);
631;     unsigned long x2 = *(unsigned long *)(p2 + i);
632;     unsigned long x3 = *(unsigned long *)(p3 + i);
633;     unsigned long x4 = *(unsigned long *)(p4 + i);
634;     sum += x1 * x2 * x3 * x4;
635;   }
636;   return sum;
637; }
638;
639define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
640; CHECK-LABEL: two_chain_two_bases_succ:
641; CHECK:       # %bb.0: # %entry
642; CHECK-NEXT:    cmpdi r7, 0
643; CHECK-NEXT:    ble cr0, .LBB6_4
644; CHECK-NEXT:  # %bb.1: # %for.body.preheader
645; CHECK-NEXT:    add r5, r5, r4
646; CHECK-NEXT:    add r6, r6, r4
647; CHECK-NEXT:    mtctr r7
648; CHECK-NEXT:    sldi r4, r4, 1
649; CHECK-NEXT:    add r5, r3, r5
650; CHECK-NEXT:    add r6, r3, r6
651; CHECK-NEXT:    li r3, 0
652; CHECK-NEXT:    .p2align 4
653; CHECK-NEXT:  .LBB6_2: # %for.body
654; CHECK-NEXT:    #
655; CHECK-NEXT:    ld r7, 0(r5)
656; CHECK-NEXT:    ldx r8, r5, r4
657; CHECK-NEXT:    ld r9, 0(r6)
658; CHECK-NEXT:    ldx r10, r6, r4
659; CHECK-NEXT:    addi r5, r5, 1
660; CHECK-NEXT:    addi r6, r6, 1
661; CHECK-NEXT:    mulld r7, r8, r7
662; CHECK-NEXT:    mulld r7, r7, r9
663; CHECK-NEXT:    maddld r3, r7, r10, r3
664; CHECK-NEXT:    bdnz .LBB6_2
665; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
666; CHECK-NEXT:    blr
; NOTE(review): positive test with two distinct bases — the preheader forms
; r5 = p+base1+offset and r6 = p+base2+offset, and both chains share the one
; offset register r4 (= 2*offset) for their second ldx load.
667; CHECK-NEXT:  .LBB6_4:
668; CHECK-NEXT:    li r3, 0
669; CHECK-NEXT:    blr
670entry:
671  %mul = mul nsw i64 %offset, 3
672  %cmp44 = icmp sgt i64 %n, 0
673  br i1 %cmp44, label %for.body, label %for.cond.cleanup
674
675for.cond.cleanup:                                 ; preds = %for.body, %entry
676  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
677  ret i64 %sum.0.lcssa
678
679for.body:                                         ; preds = %entry, %for.body
680  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
681  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
682  %add = add i64 %i.045, %base1
683  %add.ptr8.idx = add i64 %add, %offset
684  %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
685  %0 = load i64, ptr %add.ptr8, align 8
686  %add1 = add i64 %i.045, %mul
687  %add.ptr9.idx = add i64 %add1, %base1
688  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
689  %1 = load i64, ptr %add.ptr9, align 8
690  %add2 = add i64 %i.045, %base2
691  %add.ptr10.idx = add i64 %add2, %offset
692  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
693  %2 = load i64, ptr %add.ptr10, align 8
694  %add.ptr11.idx = add i64 %add2, %mul
695  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
696  %3 = load i64, ptr %add.ptr11, align 8
697  %mul12 = mul i64 %1, %0
698  %mul13 = mul i64 %mul12, %2
699  %mul14 = mul i64 %mul13, %3
700  %add15 = add i64 %mul14, %sum.046
701  %inc = add nuw nsw i64 %i.045, 1
702  %exitcond.not = icmp eq i64 %inc, %n
703  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
704}
705;
706; Check chain commoning can reduce register pressure to save register spill/reload.
707;
708; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
709;   inc = inc4;
710; #pragma unroll 4
711;   for (long long i = 0; i < 4 * m; i++) {
712;     output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
713;     output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
714;     output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
715;     inc =  inc + inc4;
716;   }
717;   return 0;
718; }
719;
720define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
721; CHECK-LABEL: spill_reduce_succ:
722; CHECK:       # %bb.0: # %entry
723; CHECK-NEXT:    cmpdi r6, 0
724; CHECK-NEXT:    std r14, -144(r1) # 8-byte Folded Spill
725; CHECK-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
726; CHECK-NEXT:    std r16, -128(r1) # 8-byte Folded Spill
727; CHECK-NEXT:    std r17, -120(r1) # 8-byte Folded Spill
728; CHECK-NEXT:    std r18, -112(r1) # 8-byte Folded Spill
729; CHECK-NEXT:    std r19, -104(r1) # 8-byte Folded Spill
730; CHECK-NEXT:    std r20, -96(r1) # 8-byte Folded Spill
731; CHECK-NEXT:    std r21, -88(r1) # 8-byte Folded Spill
732; CHECK-NEXT:    std r22, -80(r1) # 8-byte Folded Spill
733; CHECK-NEXT:    std r23, -72(r1) # 8-byte Folded Spill
734; CHECK-NEXT:    std r24, -64(r1) # 8-byte Folded Spill
735; CHECK-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
736; CHECK-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
737; CHECK-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
738; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
739; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
740; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
741; CHECK-NEXT:    std r31, -8(r1) # 8-byte Folded Spill
742; CHECK-NEXT:    std r2, -152(r1) # 8-byte Folded Spill
743; CHECK-NEXT:    std r9, -184(r1) # 8-byte Folded Spill
744; CHECK-NEXT:    std r8, -176(r1) # 8-byte Folded Spill
745; CHECK-NEXT:    std r7, -168(r1) # 8-byte Folded Spill
746; CHECK-NEXT:    std r3, -160(r1) # 8-byte Folded Spill
747; CHECK-NEXT:    ble cr0, .LBB7_7
748; CHECK-NEXT:  # %bb.1: # %for.body.preheader
749; CHECK-NEXT:    sldi r6, r6, 2
750; CHECK-NEXT:    li r7, 1
751; CHECK-NEXT:    mr r30, r10
752; CHECK-NEXT:    cmpdi r6, 1
753; CHECK-NEXT:    iselgt r7, r6, r7
754; CHECK-NEXT:    addi r8, r7, -1
755; CHECK-NEXT:    clrldi r6, r7, 63
756; CHECK-NEXT:    cmpldi r8, 3
757; CHECK-NEXT:    blt cr0, .LBB7_4
758; CHECK-NEXT:  # %bb.2: # %for.body.preheader.new
759; CHECK-NEXT:    ld r14, -168(r1) # 8-byte Folded Reload
760; CHECK-NEXT:    mulli r24, r30, 24
761; CHECK-NEXT:    ld r16, -184(r1) # 8-byte Folded Reload
762; CHECK-NEXT:    ld r15, -176(r1) # 8-byte Folded Reload
763; CHECK-NEXT:    ld r3, -160(r1) # 8-byte Folded Reload
764; CHECK-NEXT:    rldicl r0, r7, 62, 2
765; CHECK-NEXT:    sldi r11, r30, 5
766; CHECK-NEXT:    sldi r19, r30, 4
767; CHECK-NEXT:    sldi r7, r14, 3
768; CHECK-NEXT:    add r14, r30, r14
769; CHECK-NEXT:    sldi r10, r16, 3
770; CHECK-NEXT:    sldi r12, r15, 3
771; CHECK-NEXT:    add r16, r30, r16
772; CHECK-NEXT:    add r15, r30, r15
773; CHECK-NEXT:    add r27, r11, r7
774; CHECK-NEXT:    add r22, r24, r7
775; CHECK-NEXT:    add r17, r19, r7
776; CHECK-NEXT:    sldi r2, r14, 3
777; CHECK-NEXT:    add r26, r24, r10
778; CHECK-NEXT:    add r25, r24, r12
779; CHECK-NEXT:    add r21, r19, r10
780; CHECK-NEXT:    add r20, r19, r12
781; CHECK-NEXT:    add r8, r11, r10
782; CHECK-NEXT:    sldi r16, r16, 3
783; CHECK-NEXT:    add r29, r5, r27
784; CHECK-NEXT:    add r28, r4, r27
785; CHECK-NEXT:    add r27, r3, r27
786; CHECK-NEXT:    add r24, r5, r22
787; CHECK-NEXT:    add r23, r4, r22
788; CHECK-NEXT:    add r22, r3, r22
789; CHECK-NEXT:    add r19, r5, r17
790; CHECK-NEXT:    add r18, r4, r17
791; CHECK-NEXT:    add r17, r3, r17
792; CHECK-NEXT:    add r14, r5, r2
793; CHECK-NEXT:    add r31, r4, r2
794; CHECK-NEXT:    add r2, r3, r2
795; CHECK-NEXT:    add r9, r5, r8
796; CHECK-NEXT:    add r8, r11, r12
797; CHECK-NEXT:    add r26, r5, r26
798; CHECK-NEXT:    add r25, r5, r25
799; CHECK-NEXT:    add r21, r5, r21
800; CHECK-NEXT:    add r20, r5, r20
801; CHECK-NEXT:    add r16, r5, r16
802; CHECK-NEXT:    add r8, r5, r8
803; CHECK-NEXT:    rldicl r3, r0, 2, 1
804; CHECK-NEXT:    addi r3, r3, -4
805; CHECK-NEXT:    sub r0, r12, r7
806; CHECK-NEXT:    sub r12, r10, r7
807; CHECK-NEXT:    li r7, 0
808; CHECK-NEXT:    mr r10, r30
809; CHECK-NEXT:    sldi r15, r15, 3
810; CHECK-NEXT:    add r15, r5, r15
811; CHECK-NEXT:    rldicl r3, r3, 62, 2
812; CHECK-NEXT:    addi r3, r3, 1
813; CHECK-NEXT:    mtctr r3
814; CHECK-NEXT:    .p2align 4
815; CHECK-NEXT:  .LBB7_3: # %for.body
816; CHECK-NEXT:    #
817; CHECK-NEXT:    lfd f0, 0(r2)
818; CHECK-NEXT:    lfd f1, 0(r31)
819; CHECK-NEXT:    add r3, r10, r30
820; CHECK-NEXT:    add r3, r3, r30
821; CHECK-NEXT:    xsmuldp f0, f0, f1
822; CHECK-NEXT:    lfd f1, 0(r14)
823; CHECK-NEXT:    add r3, r3, r30
824; CHECK-NEXT:    add r10, r3, r30
825; CHECK-NEXT:    xsadddp f0, f1, f0
826; CHECK-NEXT:    stfd f0, 0(r14)
827; CHECK-NEXT:    add r14, r14, r11
828; CHECK-NEXT:    lfdx f0, r2, r0
829; CHECK-NEXT:    lfdx f1, r31, r0
830; CHECK-NEXT:    xsmuldp f0, f0, f1
831; CHECK-NEXT:    lfdx f1, r15, r7
832; CHECK-NEXT:    xsadddp f0, f1, f0
833; CHECK-NEXT:    stfdx f0, r15, r7
834; CHECK-NEXT:    lfdx f0, r2, r12
835; CHECK-NEXT:    lfdx f1, r31, r12
836; CHECK-NEXT:    add r2, r2, r11
837; CHECK-NEXT:    add r31, r31, r11
838; CHECK-NEXT:    xsmuldp f0, f0, f1
839; CHECK-NEXT:    lfdx f1, r16, r7
840; CHECK-NEXT:    xsadddp f0, f1, f0
841; CHECK-NEXT:    stfdx f0, r16, r7
842; CHECK-NEXT:    lfd f0, 0(r17)
843; CHECK-NEXT:    lfd f1, 0(r18)
844; CHECK-NEXT:    xsmuldp f0, f0, f1
845; CHECK-NEXT:    lfdx f1, r19, r7
846; CHECK-NEXT:    xsadddp f0, f1, f0
847; CHECK-NEXT:    stfdx f0, r19, r7
848; CHECK-NEXT:    lfdx f0, r17, r0
849; CHECK-NEXT:    lfdx f1, r18, r0
850; CHECK-NEXT:    xsmuldp f0, f0, f1
851; CHECK-NEXT:    lfdx f1, r20, r7
852; CHECK-NEXT:    xsadddp f0, f1, f0
853; CHECK-NEXT:    stfdx f0, r20, r7
854; CHECK-NEXT:    lfdx f0, r17, r12
855; CHECK-NEXT:    lfdx f1, r18, r12
856; CHECK-NEXT:    add r17, r17, r11
857; CHECK-NEXT:    add r18, r18, r11
858; CHECK-NEXT:    xsmuldp f0, f0, f1
859; CHECK-NEXT:    lfdx f1, r21, r7
860; CHECK-NEXT:    xsadddp f0, f1, f0
861; CHECK-NEXT:    stfdx f0, r21, r7
862; CHECK-NEXT:    lfd f0, 0(r22)
863; CHECK-NEXT:    lfd f1, 0(r23)
864; CHECK-NEXT:    xsmuldp f0, f0, f1
865; CHECK-NEXT:    lfdx f1, r24, r7
866; CHECK-NEXT:    xsadddp f0, f1, f0
867; CHECK-NEXT:    stfdx f0, r24, r7
868; CHECK-NEXT:    lfdx f0, r22, r0
869; CHECK-NEXT:    lfdx f1, r23, r0
870; CHECK-NEXT:    xsmuldp f0, f0, f1
871; CHECK-NEXT:    lfdx f1, r25, r7
872; CHECK-NEXT:    xsadddp f0, f1, f0
873; CHECK-NEXT:    stfdx f0, r25, r7
874; CHECK-NEXT:    lfdx f0, r22, r12
875; CHECK-NEXT:    lfdx f1, r23, r12
876; CHECK-NEXT:    add r22, r22, r11
877; CHECK-NEXT:    add r23, r23, r11
878; CHECK-NEXT:    xsmuldp f0, f0, f1
879; CHECK-NEXT:    lfdx f1, r26, r7
880; CHECK-NEXT:    xsadddp f0, f1, f0
881; CHECK-NEXT:    stfdx f0, r26, r7
882; CHECK-NEXT:    lfd f0, 0(r27)
883; CHECK-NEXT:    lfd f1, 0(r28)
884; CHECK-NEXT:    xsmuldp f0, f0, f1
885; CHECK-NEXT:    lfdx f1, r29, r7
886; CHECK-NEXT:    xsadddp f0, f1, f0
887; CHECK-NEXT:    stfdx f0, r29, r7
888; CHECK-NEXT:    lfdx f0, r27, r0
889; CHECK-NEXT:    lfdx f1, r28, r0
890; CHECK-NEXT:    xsmuldp f0, f0, f1
891; CHECK-NEXT:    lfdx f1, r8, r7
892; CHECK-NEXT:    xsadddp f0, f1, f0
893; CHECK-NEXT:    stfdx f0, r8, r7
894; CHECK-NEXT:    lfdx f0, r27, r12
895; CHECK-NEXT:    lfdx f1, r28, r12
896; CHECK-NEXT:    add r27, r27, r11
897; CHECK-NEXT:    add r28, r28, r11
898; CHECK-NEXT:    xsmuldp f0, f0, f1
899; CHECK-NEXT:    lfdx f1, r9, r7
900; CHECK-NEXT:    xsadddp f0, f1, f0
901; CHECK-NEXT:    stfdx f0, r9, r7
902; CHECK-NEXT:    add r7, r7, r11
903; CHECK-NEXT:    bdnz .LBB7_3
904; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
905; CHECK-NEXT:    cmpldi r6, 0
906; CHECK-NEXT:    beq cr0, .LBB7_7
907; CHECK-NEXT:  # %bb.5: # %for.body.epil.preheader
908; CHECK-NEXT:    ld r3, -184(r1) # 8-byte Folded Reload
909; CHECK-NEXT:    ld r0, -160(r1) # 8-byte Folded Reload
910; CHECK-NEXT:    sldi r8, r30, 3
911; CHECK-NEXT:    add r3, r10, r3
912; CHECK-NEXT:    sldi r3, r3, 3
913; CHECK-NEXT:    add r7, r5, r3
914; CHECK-NEXT:    add r9, r4, r3
915; CHECK-NEXT:    add r11, r0, r3
916; CHECK-NEXT:    ld r3, -176(r1) # 8-byte Folded Reload
917; CHECK-NEXT:    add r3, r10, r3
918; CHECK-NEXT:    sldi r3, r3, 3
919; CHECK-NEXT:    add r12, r5, r3
920; CHECK-NEXT:    add r30, r4, r3
921; CHECK-NEXT:    add r29, r0, r3
922; CHECK-NEXT:    ld r3, -168(r1) # 8-byte Folded Reload
923; CHECK-NEXT:    add r3, r10, r3
924; CHECK-NEXT:    li r10, 0
925; CHECK-NEXT:    sldi r3, r3, 3
926; CHECK-NEXT:    add r5, r5, r3
927; CHECK-NEXT:    add r4, r4, r3
928; CHECK-NEXT:    add r3, r0, r3
929; CHECK-NEXT:    .p2align 4
930; CHECK-NEXT:  .LBB7_6: # %for.body.epil
931; CHECK-NEXT:    #
932; CHECK-NEXT:    lfdx f0, r3, r10
933; CHECK-NEXT:    lfdx f1, r4, r10
934; CHECK-NEXT:    addi r6, r6, -1
935; CHECK-NEXT:    cmpldi r6, 0
936; CHECK-NEXT:    xsmuldp f0, f0, f1
937; CHECK-NEXT:    lfd f1, 0(r5)
938; CHECK-NEXT:    xsadddp f0, f1, f0
939; CHECK-NEXT:    stfd f0, 0(r5)
940; CHECK-NEXT:    add r5, r5, r8
941; CHECK-NEXT:    lfdx f0, r29, r10
942; CHECK-NEXT:    lfdx f1, r30, r10
943; CHECK-NEXT:    xsmuldp f0, f0, f1
944; CHECK-NEXT:    lfdx f1, r12, r10
945; CHECK-NEXT:    xsadddp f0, f1, f0
946; CHECK-NEXT:    stfdx f0, r12, r10
947; CHECK-NEXT:    lfdx f0, r11, r10
948; CHECK-NEXT:    lfdx f1, r9, r10
949; CHECK-NEXT:    xsmuldp f0, f0, f1
950; CHECK-NEXT:    lfdx f1, r7, r10
951; CHECK-NEXT:    xsadddp f0, f1, f0
952; CHECK-NEXT:    stfdx f0, r7, r10
953; CHECK-NEXT:    add r10, r10, r8
954; CHECK-NEXT:    bne cr0, .LBB7_6
955; CHECK-NEXT:  .LBB7_7: # %for.cond.cleanup
956; CHECK-NEXT:    ld r2, -152(r1) # 8-byte Folded Reload
957; CHECK-NEXT:    ld r31, -8(r1) # 8-byte Folded Reload
958; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
959; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
960; CHECK-NEXT:    li r3, 0
961; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
962; CHECK-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
963; CHECK-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
964; CHECK-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
965; CHECK-NEXT:    ld r24, -64(r1) # 8-byte Folded Reload
966; CHECK-NEXT:    ld r23, -72(r1) # 8-byte Folded Reload
967; CHECK-NEXT:    ld r22, -80(r1) # 8-byte Folded Reload
968; CHECK-NEXT:    ld r21, -88(r1) # 8-byte Folded Reload
969; CHECK-NEXT:    ld r20, -96(r1) # 8-byte Folded Reload
970; CHECK-NEXT:    ld r19, -104(r1) # 8-byte Folded Reload
971; CHECK-NEXT:    ld r18, -112(r1) # 8-byte Folded Reload
972; CHECK-NEXT:    ld r17, -120(r1) # 8-byte Folded Reload
973; CHECK-NEXT:    ld r16, -128(r1) # 8-byte Folded Reload
974; CHECK-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
975; CHECK-NEXT:    ld r14, -144(r1) # 8-byte Folded Reload
976; CHECK-NEXT:    blr
977entry:
978  %cmp49 = icmp sgt i64 %m, 0
979  br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
980
981for.body.preheader:                               ; preds = %entry
982  %0 = shl i64 %m, 2
983  %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
984  %1 = add nsw i64 %smax52, -1
985  %xtraiter = and i64 %smax52, 1
986  %2 = icmp ult i64 %1, 3
987  br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
988
989for.body.preheader.new:                           ; preds = %for.body.preheader
990  %unroll_iter = and i64 %smax52, 9223372036854775804
991  br label %for.body
992
993for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
994  %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
995  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
996  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
997
998for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
999  %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1000  %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1001  %add.epil = add nsw i64 %inc.addr.050.epil, %inc1
1002  %arrayidx.epil = getelementptr inbounds double, ptr %input1, i64 %add.epil
1003  %3 = load double, ptr %arrayidx.epil, align 8
1004  %arrayidx2.epil = getelementptr inbounds double, ptr %input2, i64 %add.epil
1005  %4 = load double, ptr %arrayidx2.epil, align 8
1006  %mul3.epil = fmul double %3, %4
1007  %arrayidx5.epil = getelementptr inbounds double, ptr %output, i64 %add.epil
1008  %5 = load double, ptr %arrayidx5.epil, align 8
1009  %add6.epil = fadd double %5, %mul3.epil
1010  store double %add6.epil, ptr %arrayidx5.epil, align 8
1011  %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
1012  %arrayidx8.epil = getelementptr inbounds double, ptr %input1, i64 %add7.epil
1013  %6 = load double, ptr %arrayidx8.epil, align 8
1014  %arrayidx10.epil = getelementptr inbounds double, ptr %input2, i64 %add7.epil
1015  %7 = load double, ptr %arrayidx10.epil, align 8
1016  %mul11.epil = fmul double %6, %7
1017  %arrayidx13.epil = getelementptr inbounds double, ptr %output, i64 %add7.epil
1018  %8 = load double, ptr %arrayidx13.epil, align 8
1019  %add14.epil = fadd double %8, %mul11.epil
1020  store double %add14.epil, ptr %arrayidx13.epil, align 8
1021  %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
1022  %arrayidx16.epil = getelementptr inbounds double, ptr %input1, i64 %add15.epil
1023  %9 = load double, ptr %arrayidx16.epil, align 8
1024  %arrayidx18.epil = getelementptr inbounds double, ptr %input2, i64 %add15.epil
1025  %10 = load double, ptr %arrayidx18.epil, align 8
1026  %mul19.epil = fmul double %9, %10
1027  %arrayidx21.epil = getelementptr inbounds double, ptr %output, i64 %add15.epil
1028  %11 = load double, ptr %arrayidx21.epil, align 8
1029  %add22.epil = fadd double %11, %mul19.epil
1030  store double %add22.epil, ptr %arrayidx21.epil, align 8
1031  %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
1032  %epil.iter.sub = add nsw i64 %epil.iter, -1
1033  %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
1034  br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
1035
1036for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1037  ret i32 0
1038
1039for.body:                                         ; preds = %for.body, %for.body.preheader.new
1040  %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
1041  %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1042  %add = add nsw i64 %inc.addr.050, %inc1
1043  %arrayidx = getelementptr inbounds double, ptr %input1, i64 %add
1044  %12 = load double, ptr %arrayidx, align 8
1045  %arrayidx2 = getelementptr inbounds double, ptr %input2, i64 %add
1046  %13 = load double, ptr %arrayidx2, align 8
1047  %mul3 = fmul double %12, %13
1048  %arrayidx5 = getelementptr inbounds double, ptr %output, i64 %add
1049  %14 = load double, ptr %arrayidx5, align 8
1050  %add6 = fadd double %14, %mul3
1051  store double %add6, ptr %arrayidx5, align 8
1052  %add7 = add nsw i64 %inc.addr.050, %inc2
1053  %arrayidx8 = getelementptr inbounds double, ptr %input1, i64 %add7
1054  %15 = load double, ptr %arrayidx8, align 8
1055  %arrayidx10 = getelementptr inbounds double, ptr %input2, i64 %add7
1056  %16 = load double, ptr %arrayidx10, align 8
1057  %mul11 = fmul double %15, %16
1058  %arrayidx13 = getelementptr inbounds double, ptr %output, i64 %add7
1059  %17 = load double, ptr %arrayidx13, align 8
1060  %add14 = fadd double %17, %mul11
1061  store double %add14, ptr %arrayidx13, align 8
1062  %add15 = add nsw i64 %inc.addr.050, %inc3
1063  %arrayidx16 = getelementptr inbounds double, ptr %input1, i64 %add15
1064  %18 = load double, ptr %arrayidx16, align 8
1065  %arrayidx18 = getelementptr inbounds double, ptr %input2, i64 %add15
1066  %19 = load double, ptr %arrayidx18, align 8
1067  %mul19 = fmul double %18, %19
1068  %arrayidx21 = getelementptr inbounds double, ptr %output, i64 %add15
1069  %20 = load double, ptr %arrayidx21, align 8
1070  %add22 = fadd double %20, %mul19
1071  store double %add22, ptr %arrayidx21, align 8
1072  %add23 = add nsw i64 %inc.addr.050, %inc4
1073  %add.1 = add nsw i64 %add23, %inc1
1074  %arrayidx.1 = getelementptr inbounds double, ptr %input1, i64 %add.1
1075  %21 = load double, ptr %arrayidx.1, align 8
1076  %arrayidx2.1 = getelementptr inbounds double, ptr %input2, i64 %add.1
1077  %22 = load double, ptr %arrayidx2.1, align 8
1078  %mul3.1 = fmul double %21, %22
1079  %arrayidx5.1 = getelementptr inbounds double, ptr %output, i64 %add.1
1080  %23 = load double, ptr %arrayidx5.1, align 8
1081  %add6.1 = fadd double %23, %mul3.1
1082  store double %add6.1, ptr %arrayidx5.1, align 8
1083  %add7.1 = add nsw i64 %add23, %inc2
1084  %arrayidx8.1 = getelementptr inbounds double, ptr %input1, i64 %add7.1
1085  %24 = load double, ptr %arrayidx8.1, align 8
1086  %arrayidx10.1 = getelementptr inbounds double, ptr %input2, i64 %add7.1
1087  %25 = load double, ptr %arrayidx10.1, align 8
1088  %mul11.1 = fmul double %24, %25
1089  %arrayidx13.1 = getelementptr inbounds double, ptr %output, i64 %add7.1
1090  %26 = load double, ptr %arrayidx13.1, align 8
1091  %add14.1 = fadd double %26, %mul11.1
1092  store double %add14.1, ptr %arrayidx13.1, align 8
1093  %add15.1 = add nsw i64 %add23, %inc3
1094  %arrayidx16.1 = getelementptr inbounds double, ptr %input1, i64 %add15.1
1095  %27 = load double, ptr %arrayidx16.1, align 8
1096  %arrayidx18.1 = getelementptr inbounds double, ptr %input2, i64 %add15.1
1097  %28 = load double, ptr %arrayidx18.1, align 8
1098  %mul19.1 = fmul double %27, %28
1099  %arrayidx21.1 = getelementptr inbounds double, ptr %output, i64 %add15.1
1100  %29 = load double, ptr %arrayidx21.1, align 8
1101  %add22.1 = fadd double %29, %mul19.1
1102  store double %add22.1, ptr %arrayidx21.1, align 8
1103  %add23.1 = add nsw i64 %add23, %inc4
1104  %add.2 = add nsw i64 %add23.1, %inc1
1105  %arrayidx.2 = getelementptr inbounds double, ptr %input1, i64 %add.2
1106  %30 = load double, ptr %arrayidx.2, align 8
1107  %arrayidx2.2 = getelementptr inbounds double, ptr %input2, i64 %add.2
1108  %31 = load double, ptr %arrayidx2.2, align 8
1109  %mul3.2 = fmul double %30, %31
1110  %arrayidx5.2 = getelementptr inbounds double, ptr %output, i64 %add.2
1111  %32 = load double, ptr %arrayidx5.2, align 8
1112  %add6.2 = fadd double %32, %mul3.2
1113  store double %add6.2, ptr %arrayidx5.2, align 8
1114  %add7.2 = add nsw i64 %add23.1, %inc2
1115  %arrayidx8.2 = getelementptr inbounds double, ptr %input1, i64 %add7.2
1116  %33 = load double, ptr %arrayidx8.2, align 8
1117  %arrayidx10.2 = getelementptr inbounds double, ptr %input2, i64 %add7.2
1118  %34 = load double, ptr %arrayidx10.2, align 8
1119  %mul11.2 = fmul double %33, %34
1120  %arrayidx13.2 = getelementptr inbounds double, ptr %output, i64 %add7.2
1121  %35 = load double, ptr %arrayidx13.2, align 8
1122  %add14.2 = fadd double %35, %mul11.2
1123  store double %add14.2, ptr %arrayidx13.2, align 8
1124  %add15.2 = add nsw i64 %add23.1, %inc3
1125  %arrayidx16.2 = getelementptr inbounds double, ptr %input1, i64 %add15.2
1126  %36 = load double, ptr %arrayidx16.2, align 8
1127  %arrayidx18.2 = getelementptr inbounds double, ptr %input2, i64 %add15.2
1128  %37 = load double, ptr %arrayidx18.2, align 8
1129  %mul19.2 = fmul double %36, %37
1130  %arrayidx21.2 = getelementptr inbounds double, ptr %output, i64 %add15.2
1131  %38 = load double, ptr %arrayidx21.2, align 8
1132  %add22.2 = fadd double %38, %mul19.2
1133  store double %add22.2, ptr %arrayidx21.2, align 8
1134  %add23.2 = add nsw i64 %add23.1, %inc4
1135  %add.3 = add nsw i64 %add23.2, %inc1
1136  %arrayidx.3 = getelementptr inbounds double, ptr %input1, i64 %add.3
1137  %39 = load double, ptr %arrayidx.3, align 8
1138  %arrayidx2.3 = getelementptr inbounds double, ptr %input2, i64 %add.3
1139  %40 = load double, ptr %arrayidx2.3, align 8
1140  %mul3.3 = fmul double %39, %40
1141  %arrayidx5.3 = getelementptr inbounds double, ptr %output, i64 %add.3
1142  %41 = load double, ptr %arrayidx5.3, align 8
1143  %add6.3 = fadd double %41, %mul3.3
1144  store double %add6.3, ptr %arrayidx5.3, align 8
1145  %add7.3 = add nsw i64 %add23.2, %inc2
1146  %arrayidx8.3 = getelementptr inbounds double, ptr %input1, i64 %add7.3
1147  %42 = load double, ptr %arrayidx8.3, align 8
1148  %arrayidx10.3 = getelementptr inbounds double, ptr %input2, i64 %add7.3
1149  %43 = load double, ptr %arrayidx10.3, align 8
1150  %mul11.3 = fmul double %42, %43
1151  %arrayidx13.3 = getelementptr inbounds double, ptr %output, i64 %add7.3
1152  %44 = load double, ptr %arrayidx13.3, align 8
1153  %add14.3 = fadd double %44, %mul11.3
1154  store double %add14.3, ptr %arrayidx13.3, align 8
1155  %add15.3 = add nsw i64 %add23.2, %inc3
1156  %arrayidx16.3 = getelementptr inbounds double, ptr %input1, i64 %add15.3
1157  %45 = load double, ptr %arrayidx16.3, align 8
1158  %arrayidx18.3 = getelementptr inbounds double, ptr %input2, i64 %add15.3
1159  %46 = load double, ptr %arrayidx18.3, align 8
1160  %mul19.3 = fmul double %45, %46
1161  %arrayidx21.3 = getelementptr inbounds double, ptr %output, i64 %add15.3
1162  %47 = load double, ptr %arrayidx21.3, align 8
1163  %add22.3 = fadd double %47, %mul19.3
1164  store double %add22.3, ptr %arrayidx21.3, align 8
1165  %add23.3 = add nsw i64 %add23.2, %inc4
1166  %niter.nsub.3 = add i64 %niter, -4
1167  %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
1168  br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1169}
1170
1171; Signed-max intrinsic used above to clamp the unrolled loop's trip count to >= 1.
declare i64 @llvm.smax.i64(i64, i64)
1172
1173