xref: /dflybsd-src/contrib/gcc-8.0/gcc/loop-doloop.c (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Perform doloop optimizations
2*38fd1498Szrj    Copyright (C) 2004-2018 Free Software Foundation, Inc.
3*38fd1498Szrj    Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
4*38fd1498Szrj 
5*38fd1498Szrj This file is part of GCC.
6*38fd1498Szrj 
7*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
8*38fd1498Szrj the terms of the GNU General Public License as published by the Free
9*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
10*38fd1498Szrj version.
11*38fd1498Szrj 
12*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
14*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15*38fd1498Szrj for more details.
16*38fd1498Szrj 
17*38fd1498Szrj You should have received a copy of the GNU General Public License
18*38fd1498Szrj along with GCC; see the file COPYING3.  If not see
19*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
20*38fd1498Szrj 
21*38fd1498Szrj #include "config.h"
22*38fd1498Szrj #include "system.h"
23*38fd1498Szrj #include "coretypes.h"
24*38fd1498Szrj #include "backend.h"
25*38fd1498Szrj #include "target.h"
26*38fd1498Szrj #include "rtl.h"
27*38fd1498Szrj #include "tree.h"
28*38fd1498Szrj #include "cfghooks.h"
29*38fd1498Szrj #include "memmodel.h"
30*38fd1498Szrj #include "emit-rtl.h"
31*38fd1498Szrj #include "dojump.h"
32*38fd1498Szrj #include "expr.h"
33*38fd1498Szrj #include "cfgloop.h"
34*38fd1498Szrj #include "cfgrtl.h"
35*38fd1498Szrj #include "params.h"
36*38fd1498Szrj #include "dumpfile.h"
37*38fd1498Szrj #include "loop-unroll.h"
38*38fd1498Szrj #include "regs.h"
39*38fd1498Szrj #include "df.h"
40*38fd1498Szrj 
41*38fd1498Szrj /* This module is used to modify loops with a determinable number of
42*38fd1498Szrj    iterations to use special low-overhead looping instructions.
43*38fd1498Szrj 
44*38fd1498Szrj    It first validates whether the loop is well behaved and has a
45*38fd1498Szrj    determinable number of iterations (either at compile or run-time).
46*38fd1498Szrj    It then modifies the loop to use a low-overhead looping pattern as
47*38fd1498Szrj    follows:
48*38fd1498Szrj 
49*38fd1498Szrj    1. A pseudo register is allocated as the loop iteration counter.
50*38fd1498Szrj 
51*38fd1498Szrj    2. The number of loop iterations is calculated and is stored
52*38fd1498Szrj       in the loop counter.
53*38fd1498Szrj 
54*38fd1498Szrj    3. At the end of the loop, the jump insn is replaced by the
55*38fd1498Szrj       doloop_end pattern.  The compare must remain because it might be
56*38fd1498Szrj       used elsewhere.  If the loop-variable or condition register are
57*38fd1498Szrj       used elsewhere, they will be eliminated by flow.
58*38fd1498Szrj 
59*38fd1498Szrj    4. An optional doloop_begin pattern is inserted at the top of the
60*38fd1498Szrj       loop.
61*38fd1498Szrj 
62*38fd1498Szrj    TODO The optimization should only performed when either the biv used for exit
63*38fd1498Szrj    condition is unused at all except for the exit test, or if we do not have to
64*38fd1498Szrj    change its value, since otherwise we have to add a new induction variable,
65*38fd1498Szrj    which usually will not pay up (unless the cost of the doloop pattern is
66*38fd1498Szrj    somehow extremely lower than the cost of compare & jump, or unless the bct
67*38fd1498Szrj    register cannot be used for anything else but doloop -- ??? detect these
68*38fd1498Szrj    cases).  */
69*38fd1498Szrj 
70*38fd1498Szrj /* Return the loop termination condition for PATTERN or zero
71*38fd1498Szrj    if it is not a decrement and branch jump insn.  */
72*38fd1498Szrj 
73*38fd1498Szrj rtx
doloop_condition_get(rtx_insn * doloop_pat)74*38fd1498Szrj doloop_condition_get (rtx_insn *doloop_pat)
75*38fd1498Szrj {
76*38fd1498Szrj   rtx cmp;
77*38fd1498Szrj   rtx inc;
78*38fd1498Szrj   rtx reg;
79*38fd1498Szrj   rtx inc_src;
80*38fd1498Szrj   rtx condition;
81*38fd1498Szrj   rtx pattern;
82*38fd1498Szrj   rtx cc_reg = NULL_RTX;
83*38fd1498Szrj   rtx reg_orig = NULL_RTX;
84*38fd1498Szrj 
85*38fd1498Szrj   /* The canonical doloop pattern we expect has one of the following
86*38fd1498Szrj      forms:
87*38fd1498Szrj 
88*38fd1498Szrj      1)  (parallel [(set (pc) (if_then_else (condition)
89*38fd1498Szrj 	  			            (label_ref (label))
90*38fd1498Szrj 				            (pc)))
91*38fd1498Szrj 	             (set (reg) (plus (reg) (const_int -1)))
92*38fd1498Szrj 	             (additional clobbers and uses)])
93*38fd1498Szrj 
94*38fd1498Szrj      The branch must be the first entry of the parallel (also required
95*38fd1498Szrj      by jump.c), and the second entry of the parallel must be a set of
96*38fd1498Szrj      the loop counter register.  Some targets (IA-64) wrap the set of
97*38fd1498Szrj      the loop counter in an if_then_else too.
98*38fd1498Szrj 
99*38fd1498Szrj      2)  (set (reg) (plus (reg) (const_int -1))
100*38fd1498Szrj          (set (pc) (if_then_else (reg != 0)
101*38fd1498Szrj 	                         (label_ref (label))
102*38fd1498Szrj 			         (pc))).
103*38fd1498Szrj 
104*38fd1498Szrj      Some targets (ARM) do the comparison before the branch, as in the
105*38fd1498Szrj      following form:
106*38fd1498Szrj 
107*38fd1498Szrj      3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
108*38fd1498Szrj                    (set (reg) (plus (reg) (const_int -1)))])
109*38fd1498Szrj         (set (pc) (if_then_else (cc == NE)
110*38fd1498Szrj                                 (label_ref (label))
111*38fd1498Szrj                                 (pc))) */
112*38fd1498Szrj 
113*38fd1498Szrj   pattern = PATTERN (doloop_pat);
114*38fd1498Szrj 
115*38fd1498Szrj   if (GET_CODE (pattern) != PARALLEL)
116*38fd1498Szrj     {
117*38fd1498Szrj       rtx cond;
118*38fd1498Szrj       rtx_insn *prev_insn = prev_nondebug_insn (doloop_pat);
119*38fd1498Szrj       rtx cmp_arg1, cmp_arg2;
120*38fd1498Szrj       rtx cmp_orig;
121*38fd1498Szrj 
122*38fd1498Szrj       /* In case the pattern is not PARALLEL we expect two forms
123*38fd1498Szrj 	 of doloop which are cases 2) and 3) above: in case 2) the
124*38fd1498Szrj 	 decrement immediately precedes the branch, while in case 3)
125*38fd1498Szrj 	 the compare and decrement instructions immediately precede
126*38fd1498Szrj 	 the branch.  */
127*38fd1498Szrj 
128*38fd1498Szrj       if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
129*38fd1498Szrj         return 0;
130*38fd1498Szrj 
131*38fd1498Szrj       cmp = pattern;
132*38fd1498Szrj       if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
133*38fd1498Szrj         {
134*38fd1498Szrj 	  /* The third case: the compare and decrement instructions
135*38fd1498Szrj 	     immediately precede the branch.  */
136*38fd1498Szrj 	  cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
137*38fd1498Szrj 	  if (GET_CODE (cmp_orig) != SET)
138*38fd1498Szrj 	    return 0;
139*38fd1498Szrj 	  if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
140*38fd1498Szrj 	    return 0;
141*38fd1498Szrj 	  cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
142*38fd1498Szrj           cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
143*38fd1498Szrj 	  if (cmp_arg2 != const0_rtx
144*38fd1498Szrj 	      || GET_CODE (cmp_arg1) != PLUS)
145*38fd1498Szrj 	    return 0;
146*38fd1498Szrj 	  reg_orig = XEXP (cmp_arg1, 0);
147*38fd1498Szrj 	  if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
148*38fd1498Szrj 	      || !REG_P (reg_orig))
149*38fd1498Szrj 	    return 0;
150*38fd1498Szrj 	  cc_reg = SET_DEST (cmp_orig);
151*38fd1498Szrj 
152*38fd1498Szrj 	  inc = XVECEXP (PATTERN (prev_insn), 0, 1);
153*38fd1498Szrj 	}
154*38fd1498Szrj       else
155*38fd1498Szrj         inc = PATTERN (prev_insn);
156*38fd1498Szrj       if (GET_CODE (cmp) == SET && GET_CODE (SET_SRC (cmp)) == IF_THEN_ELSE)
157*38fd1498Szrj 	{
158*38fd1498Szrj 	  /* We expect the condition to be of the form (reg != 0)  */
159*38fd1498Szrj 	  cond = XEXP (SET_SRC (cmp), 0);
160*38fd1498Szrj 	  if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
161*38fd1498Szrj 	    return 0;
162*38fd1498Szrj 	}
163*38fd1498Szrj     }
164*38fd1498Szrj   else
165*38fd1498Szrj     {
166*38fd1498Szrj       cmp = XVECEXP (pattern, 0, 0);
167*38fd1498Szrj       inc = XVECEXP (pattern, 0, 1);
168*38fd1498Szrj     }
169*38fd1498Szrj 
170*38fd1498Szrj   /* Check for (set (reg) (something)).  */
171*38fd1498Szrj   if (GET_CODE (inc) != SET)
172*38fd1498Szrj     return 0;
173*38fd1498Szrj   reg = SET_DEST (inc);
174*38fd1498Szrj   if (! REG_P (reg))
175*38fd1498Szrj     return 0;
176*38fd1498Szrj 
177*38fd1498Szrj   /* Check if something = (plus (reg) (const_int -1)).
178*38fd1498Szrj      On IA-64, this decrement is wrapped in an if_then_else.  */
179*38fd1498Szrj   inc_src = SET_SRC (inc);
180*38fd1498Szrj   if (GET_CODE (inc_src) == IF_THEN_ELSE)
181*38fd1498Szrj     inc_src = XEXP (inc_src, 1);
182*38fd1498Szrj   if (GET_CODE (inc_src) != PLUS
183*38fd1498Szrj       || XEXP (inc_src, 0) != reg
184*38fd1498Szrj       || XEXP (inc_src, 1) != constm1_rtx)
185*38fd1498Szrj     return 0;
186*38fd1498Szrj 
187*38fd1498Szrj   /* Check for (set (pc) (if_then_else (condition)
188*38fd1498Szrj                                        (label_ref (label))
189*38fd1498Szrj                                        (pc))).  */
190*38fd1498Szrj   if (GET_CODE (cmp) != SET
191*38fd1498Szrj       || SET_DEST (cmp) != pc_rtx
192*38fd1498Szrj       || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
193*38fd1498Szrj       || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
194*38fd1498Szrj       || XEXP (SET_SRC (cmp), 2) != pc_rtx)
195*38fd1498Szrj     return 0;
196*38fd1498Szrj 
197*38fd1498Szrj   /* Extract loop termination condition.  */
198*38fd1498Szrj   condition = XEXP (SET_SRC (cmp), 0);
199*38fd1498Szrj 
200*38fd1498Szrj   /* We expect a GE or NE comparison with 0 or 1.  */
201*38fd1498Szrj   if ((GET_CODE (condition) != GE
202*38fd1498Szrj        && GET_CODE (condition) != NE)
203*38fd1498Szrj       || (XEXP (condition, 1) != const0_rtx
204*38fd1498Szrj           && XEXP (condition, 1) != const1_rtx))
205*38fd1498Szrj     return 0;
206*38fd1498Szrj 
207*38fd1498Szrj   if ((XEXP (condition, 0) == reg)
208*38fd1498Szrj       /* For the third case:  */
209*38fd1498Szrj       || ((cc_reg != NULL_RTX)
210*38fd1498Szrj 	  && (XEXP (condition, 0) == cc_reg)
211*38fd1498Szrj 	  && (reg_orig == reg))
212*38fd1498Szrj       || (GET_CODE (XEXP (condition, 0)) == PLUS
213*38fd1498Szrj 	  && XEXP (XEXP (condition, 0), 0) == reg))
214*38fd1498Szrj    {
215*38fd1498Szrj      if (GET_CODE (pattern) != PARALLEL)
216*38fd1498Szrj      /*  For the second form we expect:
217*38fd1498Szrj 
218*38fd1498Szrj          (set (reg) (plus (reg) (const_int -1))
219*38fd1498Szrj          (set (pc) (if_then_else (reg != 0)
220*38fd1498Szrj                                  (label_ref (label))
221*38fd1498Szrj                                  (pc))).
222*38fd1498Szrj 
223*38fd1498Szrj          is equivalent to the following:
224*38fd1498Szrj 
225*38fd1498Szrj          (parallel [(set (pc) (if_then_else (reg != 1)
226*38fd1498Szrj                                             (label_ref (label))
227*38fd1498Szrj                                             (pc)))
228*38fd1498Szrj                      (set (reg) (plus (reg) (const_int -1)))
229*38fd1498Szrj                      (additional clobbers and uses)])
230*38fd1498Szrj 
231*38fd1498Szrj         For the third form we expect:
232*38fd1498Szrj 
233*38fd1498Szrj         (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
234*38fd1498Szrj                    (set (reg) (plus (reg) (const_int -1)))])
235*38fd1498Szrj         (set (pc) (if_then_else (cc == NE)
236*38fd1498Szrj                                 (label_ref (label))
237*38fd1498Szrj                                 (pc)))
238*38fd1498Szrj 
239*38fd1498Szrj         which is equivalent to the following:
240*38fd1498Szrj 
241*38fd1498Szrj         (parallel [(set (cc) (compare (reg,  1))
242*38fd1498Szrj                    (set (reg) (plus (reg) (const_int -1)))
243*38fd1498Szrj                    (set (pc) (if_then_else (NE == cc)
244*38fd1498Szrj                                            (label_ref (label))
245*38fd1498Szrj                                            (pc))))])
246*38fd1498Szrj 
247*38fd1498Szrj         So we return the second form instead for the two cases.
248*38fd1498Szrj 
249*38fd1498Szrj      */
250*38fd1498Szrj         condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
251*38fd1498Szrj 
252*38fd1498Szrj     return condition;
253*38fd1498Szrj    }
254*38fd1498Szrj 
255*38fd1498Szrj   /* ??? If a machine uses a funny comparison, we could return a
256*38fd1498Szrj      canonicalized form here.  */
257*38fd1498Szrj 
258*38fd1498Szrj   return 0;
259*38fd1498Szrj }
260*38fd1498Szrj 
261*38fd1498Szrj /* Return nonzero if the loop specified by LOOP is suitable for
262*38fd1498Szrj    the use of special low-overhead looping instructions.  DESC
263*38fd1498Szrj    describes the number of iterations of the loop.  */
264*38fd1498Szrj 
265*38fd1498Szrj static bool
doloop_valid_p(struct loop * loop,struct niter_desc * desc)266*38fd1498Szrj doloop_valid_p (struct loop *loop, struct niter_desc *desc)
267*38fd1498Szrj {
268*38fd1498Szrj   basic_block *body = get_loop_body (loop), bb;
269*38fd1498Szrj   rtx_insn *insn;
270*38fd1498Szrj   unsigned i;
271*38fd1498Szrj   bool result = true;
272*38fd1498Szrj 
273*38fd1498Szrj   /* Check for loops that may not terminate under special conditions.  */
274*38fd1498Szrj   if (!desc->simple_p
275*38fd1498Szrj       || desc->assumptions
276*38fd1498Szrj       || desc->infinite)
277*38fd1498Szrj     {
278*38fd1498Szrj       /* There are some cases that would require a special attention.
279*38fd1498Szrj 	 For example if the comparison is LEU and the comparison value
280*38fd1498Szrj 	 is UINT_MAX then the loop will not terminate.  Similarly, if the
281*38fd1498Szrj 	 comparison code is GEU and the comparison value is 0, the
282*38fd1498Szrj 	 loop will not terminate.
283*38fd1498Szrj 
284*38fd1498Szrj 	 If the absolute increment is not 1, the loop can be infinite
285*38fd1498Szrj 	 even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
286*38fd1498Szrj 
287*38fd1498Szrj 	 ??? We could compute these conditions at run-time and have a
288*38fd1498Szrj 	 additional jump around the loop to ensure an infinite loop.
289*38fd1498Szrj 	 However, it is very unlikely that this is the intended
290*38fd1498Szrj 	 behavior of the loop and checking for these rare boundary
291*38fd1498Szrj 	 conditions would pessimize all other code.
292*38fd1498Szrj 
293*38fd1498Szrj 	 If the loop is executed only a few times an extra check to
294*38fd1498Szrj 	 restart the loop could use up most of the benefits of using a
295*38fd1498Szrj 	 count register loop.  Note however, that normally, this
296*38fd1498Szrj 	 restart branch would never execute, so it could be predicted
297*38fd1498Szrj 	 well by the CPU.  We should generate the pessimistic code by
298*38fd1498Szrj 	 default, and have an option, e.g. -funsafe-loops that would
299*38fd1498Szrj 	 enable count-register loops in this case.  */
300*38fd1498Szrj       if (dump_file)
301*38fd1498Szrj 	fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
302*38fd1498Szrj       result = false;
303*38fd1498Szrj       goto cleanup;
304*38fd1498Szrj     }
305*38fd1498Szrj 
306*38fd1498Szrj   for (i = 0; i < loop->num_nodes; i++)
307*38fd1498Szrj     {
308*38fd1498Szrj       bb = body[i];
309*38fd1498Szrj 
310*38fd1498Szrj       for (insn = BB_HEAD (bb);
311*38fd1498Szrj 	   insn != NEXT_INSN (BB_END (bb));
312*38fd1498Szrj 	   insn = NEXT_INSN (insn))
313*38fd1498Szrj 	{
314*38fd1498Szrj 	  /* Different targets have different necessities for low-overhead
315*38fd1498Szrj 	     looping.  Call the back end for each instruction within the loop
316*38fd1498Szrj 	     to let it decide whether the insn prohibits a low-overhead loop.
317*38fd1498Szrj 	     It will then return the cause for it to emit to the dump file.  */
318*38fd1498Szrj 	  const char * invalid = targetm.invalid_within_doloop (insn);
319*38fd1498Szrj 	  if (invalid)
320*38fd1498Szrj 	    {
321*38fd1498Szrj 	      if (dump_file)
322*38fd1498Szrj 		fprintf (dump_file, "Doloop: %s\n", invalid);
323*38fd1498Szrj 	      result = false;
324*38fd1498Szrj 	      goto cleanup;
325*38fd1498Szrj 	    }
326*38fd1498Szrj 	}
327*38fd1498Szrj     }
328*38fd1498Szrj   result = true;
329*38fd1498Szrj 
330*38fd1498Szrj cleanup:
331*38fd1498Szrj   free (body);
332*38fd1498Szrj 
333*38fd1498Szrj   return result;
334*38fd1498Szrj }
335*38fd1498Szrj 
336*38fd1498Szrj /* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
337*38fd1498Szrj    edge.  If the condition is always false, do not do anything.  If it is always
338*38fd1498Szrj    true, redirect E to DEST and return false.  In all other cases, true is
339*38fd1498Szrj    returned.  */
340*38fd1498Szrj 
341*38fd1498Szrj static bool
add_test(rtx cond,edge * e,basic_block dest)342*38fd1498Szrj add_test (rtx cond, edge *e, basic_block dest)
343*38fd1498Szrj {
344*38fd1498Szrj   rtx_insn *seq, *jump;
345*38fd1498Szrj   rtx_code_label *label;
346*38fd1498Szrj   machine_mode mode;
347*38fd1498Szrj   rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
348*38fd1498Szrj   enum rtx_code code = GET_CODE (cond);
349*38fd1498Szrj   basic_block bb;
350*38fd1498Szrj   /* The jump is supposed to handle an unlikely special case.  */
351*38fd1498Szrj   profile_probability prob = profile_probability::guessed_never ();
352*38fd1498Szrj 
353*38fd1498Szrj   mode = GET_MODE (XEXP (cond, 0));
354*38fd1498Szrj   if (mode == VOIDmode)
355*38fd1498Szrj     mode = GET_MODE (XEXP (cond, 1));
356*38fd1498Szrj 
357*38fd1498Szrj   start_sequence ();
358*38fd1498Szrj   op0 = force_operand (op0, NULL_RTX);
359*38fd1498Szrj   op1 = force_operand (op1, NULL_RTX);
360*38fd1498Szrj   label = block_label (dest);
361*38fd1498Szrj   do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, NULL, label,
362*38fd1498Szrj 			   prob);
363*38fd1498Szrj 
364*38fd1498Szrj   jump = get_last_insn ();
365*38fd1498Szrj   if (!jump || !JUMP_P (jump))
366*38fd1498Szrj     {
367*38fd1498Szrj       /* The condition is always false and the jump was optimized out.  */
368*38fd1498Szrj       end_sequence ();
369*38fd1498Szrj       return true;
370*38fd1498Szrj     }
371*38fd1498Szrj 
372*38fd1498Szrj   seq = get_insns ();
373*38fd1498Szrj   unshare_all_rtl_in_chain (seq);
374*38fd1498Szrj   end_sequence ();
375*38fd1498Szrj 
376*38fd1498Szrj   /* There always is at least the jump insn in the sequence.  */
377*38fd1498Szrj   gcc_assert (seq != NULL_RTX);
378*38fd1498Szrj 
379*38fd1498Szrj   bb = split_edge_and_insert (*e, seq);
380*38fd1498Szrj   *e = single_succ_edge (bb);
381*38fd1498Szrj 
382*38fd1498Szrj   if (any_uncondjump_p (jump))
383*38fd1498Szrj     {
384*38fd1498Szrj       /* The condition is always true.  */
385*38fd1498Szrj       delete_insn (jump);
386*38fd1498Szrj       redirect_edge_and_branch_force (*e, dest);
387*38fd1498Szrj       return false;
388*38fd1498Szrj     }
389*38fd1498Szrj 
390*38fd1498Szrj   JUMP_LABEL (jump) = label;
391*38fd1498Szrj 
392*38fd1498Szrj   LABEL_NUSES (label)++;
393*38fd1498Szrj 
394*38fd1498Szrj   edge e2 = make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
395*38fd1498Szrj   e2->probability = prob;
396*38fd1498Szrj   (*e)->probability = prob.invert ();
397*38fd1498Szrj   update_br_prob_note (e2->src);
398*38fd1498Szrj   return true;
399*38fd1498Szrj }
400*38fd1498Szrj 
401*38fd1498Szrj /* Modify the loop to use the low-overhead looping insn where LOOP
402*38fd1498Szrj    describes the loop, DESC describes the number of iterations of the
403*38fd1498Szrj    loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
404*38fd1498Szrj    end of the loop.  CONDITION is the condition separated from the
405*38fd1498Szrj    DOLOOP_SEQ.  COUNT is the number of iterations of the LOOP.  */
406*38fd1498Szrj 
407*38fd1498Szrj static void
doloop_modify(struct loop * loop,struct niter_desc * desc,rtx_insn * doloop_seq,rtx condition,rtx count)408*38fd1498Szrj doloop_modify (struct loop *loop, struct niter_desc *desc,
409*38fd1498Szrj 	       rtx_insn *doloop_seq, rtx condition, rtx count)
410*38fd1498Szrj {
411*38fd1498Szrj   rtx counter_reg;
412*38fd1498Szrj   rtx tmp, noloop = NULL_RTX;
413*38fd1498Szrj   rtx_insn *sequence;
414*38fd1498Szrj   rtx_insn *jump_insn;
415*38fd1498Szrj   rtx_code_label *jump_label;
416*38fd1498Szrj   int nonneg = 0;
417*38fd1498Szrj   bool increment_count;
418*38fd1498Szrj   basic_block loop_end = desc->out_edge->src;
419*38fd1498Szrj   scalar_int_mode mode;
420*38fd1498Szrj   widest_int iterations;
421*38fd1498Szrj 
422*38fd1498Szrj   jump_insn = BB_END (loop_end);
423*38fd1498Szrj 
424*38fd1498Szrj   if (dump_file)
425*38fd1498Szrj     {
426*38fd1498Szrj       fprintf (dump_file, "Doloop: Inserting doloop pattern (");
427*38fd1498Szrj       if (desc->const_iter)
428*38fd1498Szrj 	fprintf (dump_file, "%" PRId64, desc->niter);
429*38fd1498Szrj       else
430*38fd1498Szrj 	fputs ("runtime", dump_file);
431*38fd1498Szrj       fputs (" iterations).\n", dump_file);
432*38fd1498Szrj     }
433*38fd1498Szrj 
434*38fd1498Szrj   /* Discard original jump to continue loop.  The original compare
435*38fd1498Szrj      result may still be live, so it cannot be discarded explicitly.  */
436*38fd1498Szrj   delete_insn (jump_insn);
437*38fd1498Szrj 
438*38fd1498Szrj   counter_reg = XEXP (condition, 0);
439*38fd1498Szrj   if (GET_CODE (counter_reg) == PLUS)
440*38fd1498Szrj     counter_reg = XEXP (counter_reg, 0);
441*38fd1498Szrj   /* These patterns must operate on integer counters.  */
442*38fd1498Szrj   mode = as_a <scalar_int_mode> (GET_MODE (counter_reg));
443*38fd1498Szrj 
444*38fd1498Szrj   increment_count = false;
445*38fd1498Szrj   switch (GET_CODE (condition))
446*38fd1498Szrj     {
447*38fd1498Szrj     case NE:
448*38fd1498Szrj       /* Currently only NE tests against zero and one are supported.  */
449*38fd1498Szrj       noloop = XEXP (condition, 1);
450*38fd1498Szrj       if (noloop != const0_rtx)
451*38fd1498Szrj 	{
452*38fd1498Szrj 	  gcc_assert (noloop == const1_rtx);
453*38fd1498Szrj 	  increment_count = true;
454*38fd1498Szrj 	}
455*38fd1498Szrj       break;
456*38fd1498Szrj 
457*38fd1498Szrj     case GE:
458*38fd1498Szrj       /* Currently only GE tests against zero are supported.  */
459*38fd1498Szrj       gcc_assert (XEXP (condition, 1) == const0_rtx);
460*38fd1498Szrj 
461*38fd1498Szrj       noloop = constm1_rtx;
462*38fd1498Szrj 
463*38fd1498Szrj       /* The iteration count does not need incrementing for a GE test.  */
464*38fd1498Szrj       increment_count = false;
465*38fd1498Szrj 
466*38fd1498Szrj       /* Determine if the iteration counter will be non-negative.
467*38fd1498Szrj 	 Note that the maximum value loaded is iterations_max - 1.  */
468*38fd1498Szrj       if (get_max_loop_iterations (loop, &iterations)
469*38fd1498Szrj 	  && wi::leu_p (iterations,
470*38fd1498Szrj 			wi::set_bit_in_zero <widest_int>
471*38fd1498Szrj 			(GET_MODE_PRECISION (mode) - 1)))
472*38fd1498Szrj 	nonneg = 1;
473*38fd1498Szrj       break;
474*38fd1498Szrj 
475*38fd1498Szrj       /* Abort if an invalid doloop pattern has been generated.  */
476*38fd1498Szrj     default:
477*38fd1498Szrj       gcc_unreachable ();
478*38fd1498Szrj     }
479*38fd1498Szrj 
480*38fd1498Szrj   if (increment_count)
481*38fd1498Szrj     count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
482*38fd1498Szrj 
483*38fd1498Szrj   /* Insert initialization of the count register into the loop header.  */
484*38fd1498Szrj   start_sequence ();
485*38fd1498Szrj   /* count has been already copied through copy_rtx.  */
486*38fd1498Szrj   reset_used_flags (count);
487*38fd1498Szrj   set_used_flags (condition);
488*38fd1498Szrj   tmp = force_operand (count, counter_reg);
489*38fd1498Szrj   convert_move (counter_reg, tmp, 1);
490*38fd1498Szrj   sequence = get_insns ();
491*38fd1498Szrj   unshare_all_rtl_in_chain (sequence);
492*38fd1498Szrj   end_sequence ();
493*38fd1498Szrj   emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
494*38fd1498Szrj 
495*38fd1498Szrj   if (desc->noloop_assumptions)
496*38fd1498Szrj     {
497*38fd1498Szrj       rtx ass = copy_rtx (desc->noloop_assumptions);
498*38fd1498Szrj       basic_block preheader = loop_preheader_edge (loop)->src;
499*38fd1498Szrj       basic_block set_zero = split_edge (loop_preheader_edge (loop));
500*38fd1498Szrj       basic_block new_preheader = split_edge (loop_preheader_edge (loop));
501*38fd1498Szrj       edge te;
502*38fd1498Szrj 
503*38fd1498Szrj       /* Expand the condition testing the assumptions and if it does not pass,
504*38fd1498Szrj 	 reset the count register to 0.  */
505*38fd1498Szrj       redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
506*38fd1498Szrj       set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
507*38fd1498Szrj 
508*38fd1498Szrj       set_zero->count = profile_count::uninitialized ();
509*38fd1498Szrj 
510*38fd1498Szrj       te = single_succ_edge (preheader);
511*38fd1498Szrj       for (; ass; ass = XEXP (ass, 1))
512*38fd1498Szrj 	if (!add_test (XEXP (ass, 0), &te, set_zero))
513*38fd1498Szrj 	  break;
514*38fd1498Szrj 
515*38fd1498Szrj       if (ass)
516*38fd1498Szrj 	{
517*38fd1498Szrj 	  /* We reached a condition that is always true.  This is very hard to
518*38fd1498Szrj 	     reproduce (such a loop does not roll, and thus it would most
519*38fd1498Szrj 	     likely get optimized out by some of the preceding optimizations).
520*38fd1498Szrj 	     In fact, I do not have any testcase for it.  However, it would
521*38fd1498Szrj 	     also be very hard to show that it is impossible, so we must
522*38fd1498Szrj 	     handle this case.  */
523*38fd1498Szrj 	  set_zero->count = preheader->count;
524*38fd1498Szrj 	}
525*38fd1498Szrj 
526*38fd1498Szrj       if (EDGE_COUNT (set_zero->preds) == 0)
527*38fd1498Szrj 	{
528*38fd1498Szrj 	  /* All the conditions were simplified to false, remove the
529*38fd1498Szrj 	     unreachable set_zero block.  */
530*38fd1498Szrj 	  delete_basic_block (set_zero);
531*38fd1498Szrj 	}
532*38fd1498Szrj       else
533*38fd1498Szrj 	{
534*38fd1498Szrj 	  /* Reset the counter to zero in the set_zero block.  */
535*38fd1498Szrj 	  start_sequence ();
536*38fd1498Szrj 	  convert_move (counter_reg, noloop, 0);
537*38fd1498Szrj 	  sequence = get_insns ();
538*38fd1498Szrj 	  end_sequence ();
539*38fd1498Szrj 	  emit_insn_after (sequence, BB_END (set_zero));
540*38fd1498Szrj 
541*38fd1498Szrj 	  set_immediate_dominator (CDI_DOMINATORS, set_zero,
542*38fd1498Szrj 				   recompute_dominator (CDI_DOMINATORS,
543*38fd1498Szrj 							set_zero));
544*38fd1498Szrj 	}
545*38fd1498Szrj 
546*38fd1498Szrj       set_immediate_dominator (CDI_DOMINATORS, new_preheader,
547*38fd1498Szrj 			       recompute_dominator (CDI_DOMINATORS,
548*38fd1498Szrj 						    new_preheader));
549*38fd1498Szrj     }
550*38fd1498Szrj 
551*38fd1498Szrj   /* Some targets (eg, C4x) need to initialize special looping
552*38fd1498Szrj      registers.  */
553*38fd1498Szrj   if (targetm.have_doloop_begin ())
554*38fd1498Szrj     if (rtx_insn *seq = targetm.gen_doloop_begin (counter_reg, doloop_seq))
555*38fd1498Szrj       emit_insn_after (seq, BB_END (loop_preheader_edge (loop)->src));
556*38fd1498Szrj 
557*38fd1498Szrj   /* Insert the new low-overhead looping insn.  */
558*38fd1498Szrj   emit_jump_insn_after (doloop_seq, BB_END (loop_end));
559*38fd1498Szrj   jump_insn = BB_END (loop_end);
560*38fd1498Szrj   jump_label = block_label (desc->in_edge->dest);
561*38fd1498Szrj   JUMP_LABEL (jump_insn) = jump_label;
562*38fd1498Szrj   LABEL_NUSES (jump_label)++;
563*38fd1498Szrj 
564*38fd1498Szrj   /* Ensure the right fallthru edge is marked, for case we have reversed
565*38fd1498Szrj      the condition.  */
566*38fd1498Szrj   desc->in_edge->flags &= ~EDGE_FALLTHRU;
567*38fd1498Szrj   desc->out_edge->flags |= EDGE_FALLTHRU;
568*38fd1498Szrj 
569*38fd1498Szrj   /* Add a REG_NONNEG note if the actual or estimated maximum number
570*38fd1498Szrj      of iterations is non-negative.  */
571*38fd1498Szrj   if (nonneg)
572*38fd1498Szrj     add_reg_note (jump_insn, REG_NONNEG, NULL_RTX);
573*38fd1498Szrj 
574*38fd1498Szrj   /* Update the REG_BR_PROB note.  */
575*38fd1498Szrj   if (desc->in_edge->probability.initialized_p ())
576*38fd1498Szrj     add_reg_br_prob_note (jump_insn, desc->in_edge->probability);
577*38fd1498Szrj }
578*38fd1498Szrj 
579*38fd1498Szrj /* Called through note_stores.  */
580*38fd1498Szrj 
581*38fd1498Szrj static void
record_reg_sets(rtx x,const_rtx pat ATTRIBUTE_UNUSED,void * data)582*38fd1498Szrj record_reg_sets (rtx x, const_rtx pat ATTRIBUTE_UNUSED, void *data)
583*38fd1498Szrj {
584*38fd1498Szrj   bitmap mod = (bitmap)data;
585*38fd1498Szrj   if (REG_P (x))
586*38fd1498Szrj     {
587*38fd1498Szrj       unsigned int regno = REGNO (x);
588*38fd1498Szrj       if (HARD_REGISTER_P (x))
589*38fd1498Szrj 	{
590*38fd1498Szrj 	  unsigned int end_regno = end_hard_regno (GET_MODE (x), regno);
591*38fd1498Szrj 	  do
592*38fd1498Szrj 	    bitmap_set_bit (mod, regno);
593*38fd1498Szrj 	  while (++regno < end_regno);
594*38fd1498Szrj 	}
595*38fd1498Szrj       else
596*38fd1498Szrj 	bitmap_set_bit (mod, regno);
597*38fd1498Szrj     }
598*38fd1498Szrj }
599*38fd1498Szrj 
600*38fd1498Szrj /* Process loop described by LOOP validating that the loop is suitable for
601*38fd1498Szrj    conversion to use a low overhead looping instruction, replacing the jump
602*38fd1498Szrj    insn where suitable.  Returns true if the loop was successfully
603*38fd1498Szrj    modified.  */
604*38fd1498Szrj 
605*38fd1498Szrj static bool
doloop_optimize(struct loop * loop)606*38fd1498Szrj doloop_optimize (struct loop *loop)
607*38fd1498Szrj {
608*38fd1498Szrj   scalar_int_mode mode;
609*38fd1498Szrj   rtx doloop_reg;
610*38fd1498Szrj   rtx count;
611*38fd1498Szrj   widest_int iterations, iterations_max;
612*38fd1498Szrj   rtx_code_label *start_label;
613*38fd1498Szrj   rtx condition;
614*38fd1498Szrj   unsigned level;
615*38fd1498Szrj   HOST_WIDE_INT est_niter;
616*38fd1498Szrj   int max_cost;
617*38fd1498Szrj   struct niter_desc *desc;
618*38fd1498Szrj   unsigned word_mode_size;
619*38fd1498Szrj   unsigned HOST_WIDE_INT word_mode_max;
620*38fd1498Szrj   int entered_at_top;
621*38fd1498Szrj 
622*38fd1498Szrj   if (dump_file)
623*38fd1498Szrj     fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
624*38fd1498Szrj 
625*38fd1498Szrj   iv_analysis_loop_init (loop);
626*38fd1498Szrj 
627*38fd1498Szrj   /* Find the simple exit of a LOOP.  */
628*38fd1498Szrj   desc = get_simple_loop_desc (loop);
629*38fd1498Szrj 
630*38fd1498Szrj   /* Check that loop is a candidate for a low-overhead looping insn.  */
631*38fd1498Szrj   if (!doloop_valid_p (loop, desc))
632*38fd1498Szrj     {
633*38fd1498Szrj       if (dump_file)
634*38fd1498Szrj 	fprintf (dump_file,
635*38fd1498Szrj 		 "Doloop: The loop is not suitable.\n");
636*38fd1498Szrj       return false;
637*38fd1498Szrj     }
638*38fd1498Szrj   mode = desc->mode;
639*38fd1498Szrj 
640*38fd1498Szrj   est_niter = get_estimated_loop_iterations_int (loop);
641*38fd1498Szrj   if (est_niter == -1)
642*38fd1498Szrj     est_niter = get_likely_max_loop_iterations_int (loop);
643*38fd1498Szrj 
644*38fd1498Szrj   if (est_niter >= 0 && est_niter < 3)
645*38fd1498Szrj     {
646*38fd1498Szrj       if (dump_file)
647*38fd1498Szrj 	fprintf (dump_file,
648*38fd1498Szrj 		 "Doloop: Too few iterations (%u) to be profitable.\n",
649*38fd1498Szrj 		 (unsigned int)est_niter);
650*38fd1498Szrj       return false;
651*38fd1498Szrj     }
652*38fd1498Szrj 
653*38fd1498Szrj   max_cost
654*38fd1498Szrj     = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
655*38fd1498Szrj   if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
656*38fd1498Szrj       > max_cost)
657*38fd1498Szrj     {
658*38fd1498Szrj       if (dump_file)
659*38fd1498Szrj 	fprintf (dump_file,
660*38fd1498Szrj 		 "Doloop: number of iterations too costly to compute.\n");
661*38fd1498Szrj       return false;
662*38fd1498Szrj     }
663*38fd1498Szrj 
664*38fd1498Szrj   if (desc->const_iter)
665*38fd1498Szrj     iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
666*38fd1498Szrj 				   UNSIGNED);
667*38fd1498Szrj   else
668*38fd1498Szrj     iterations = 0;
669*38fd1498Szrj   if (!get_max_loop_iterations (loop, &iterations_max))
670*38fd1498Szrj     iterations_max = 0;
671*38fd1498Szrj   level = get_loop_level (loop) + 1;
672*38fd1498Szrj   entered_at_top = (loop->latch == desc->in_edge->dest
673*38fd1498Szrj 		    && contains_no_active_insn_p (loop->latch));
674*38fd1498Szrj   if (!targetm.can_use_doloop_p (iterations, iterations_max, level,
675*38fd1498Szrj 				 entered_at_top))
676*38fd1498Szrj     {
677*38fd1498Szrj       if (dump_file)
678*38fd1498Szrj 	fprintf (dump_file, "Loop rejected by can_use_doloop_p.\n");
679*38fd1498Szrj       return false;
680*38fd1498Szrj     }
681*38fd1498Szrj 
682*38fd1498Szrj   /* Generate looping insn.  If the pattern FAILs then give up trying
683*38fd1498Szrj      to modify the loop since there is some aspect the back-end does
684*38fd1498Szrj      not like.  */
685*38fd1498Szrj   count = copy_rtx (desc->niter_expr);
686*38fd1498Szrj   start_label = block_label (desc->in_edge->dest);
687*38fd1498Szrj   doloop_reg = gen_reg_rtx (mode);
688*38fd1498Szrj   rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
689*38fd1498Szrj 
690*38fd1498Szrj   word_mode_size = GET_MODE_PRECISION (word_mode);
691*38fd1498Szrj   word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
692*38fd1498Szrj   if (! doloop_seq
693*38fd1498Szrj       && mode != word_mode
694*38fd1498Szrj       /* Before trying mode different from the one in that # of iterations is
695*38fd1498Szrj 	 computed, we must be sure that the number of iterations fits into
696*38fd1498Szrj 	 the new mode.  */
697*38fd1498Szrj       && (word_mode_size >= GET_MODE_PRECISION (mode)
698*38fd1498Szrj  	  || wi::leu_p (iterations_max, word_mode_max)))
699*38fd1498Szrj     {
700*38fd1498Szrj       if (word_mode_size > GET_MODE_PRECISION (mode))
701*38fd1498Szrj 	count = simplify_gen_unary (ZERO_EXTEND, word_mode, count, mode);
702*38fd1498Szrj       else
703*38fd1498Szrj 	count = lowpart_subreg (word_mode, count, mode);
704*38fd1498Szrj       PUT_MODE (doloop_reg, word_mode);
705*38fd1498Szrj       doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
706*38fd1498Szrj     }
707*38fd1498Szrj   if (! doloop_seq)
708*38fd1498Szrj     {
709*38fd1498Szrj       if (dump_file)
710*38fd1498Szrj 	fprintf (dump_file,
711*38fd1498Szrj 		 "Doloop: Target unwilling to use doloop pattern!\n");
712*38fd1498Szrj       return false;
713*38fd1498Szrj     }
714*38fd1498Szrj 
715*38fd1498Szrj   /* If multiple instructions were created, the last must be the
716*38fd1498Szrj      jump instruction.  */
717*38fd1498Szrj   rtx_insn *doloop_insn = doloop_seq;
718*38fd1498Szrj   while (NEXT_INSN (doloop_insn) != NULL_RTX)
719*38fd1498Szrj     doloop_insn = NEXT_INSN (doloop_insn);
720*38fd1498Szrj   if (!JUMP_P (doloop_insn)
721*38fd1498Szrj       || !(condition = doloop_condition_get (doloop_insn)))
722*38fd1498Szrj     {
723*38fd1498Szrj       if (dump_file)
724*38fd1498Szrj 	fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
725*38fd1498Szrj       return false;
726*38fd1498Szrj     }
727*38fd1498Szrj 
728*38fd1498Szrj   /* Ensure that the new sequence doesn't clobber a register that
729*38fd1498Szrj      is live at the end of the block.  */
730*38fd1498Szrj   {
731*38fd1498Szrj     bitmap modified = BITMAP_ALLOC (NULL);
732*38fd1498Szrj 
733*38fd1498Szrj     for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i))
734*38fd1498Szrj       note_stores (PATTERN (i), record_reg_sets, modified);
735*38fd1498Szrj 
736*38fd1498Szrj     basic_block loop_end = desc->out_edge->src;
737*38fd1498Szrj     bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified);
738*38fd1498Szrj     BITMAP_FREE (modified);
739*38fd1498Szrj 
740*38fd1498Szrj     if (fail)
741*38fd1498Szrj       {
742*38fd1498Szrj 	if (dump_file)
743*38fd1498Szrj 	  fprintf (dump_file, "Doloop: doloop pattern clobbers live out\n");
744*38fd1498Szrj 	return false;
745*38fd1498Szrj       }
746*38fd1498Szrj   }
747*38fd1498Szrj 
748*38fd1498Szrj   doloop_modify (loop, desc, doloop_seq, condition, count);
749*38fd1498Szrj   return true;
750*38fd1498Szrj }
751*38fd1498Szrj 
752*38fd1498Szrj /* This is the main entry point.  Process all loops using doloop_optimize.  */
753*38fd1498Szrj 
754*38fd1498Szrj void
doloop_optimize_loops(void)755*38fd1498Szrj doloop_optimize_loops (void)
756*38fd1498Szrj {
757*38fd1498Szrj   struct loop *loop;
758*38fd1498Szrj 
759*38fd1498Szrj   if (optimize == 1)
760*38fd1498Szrj     {
761*38fd1498Szrj       df_live_add_problem ();
762*38fd1498Szrj       df_live_set_all_dirty ();
763*38fd1498Szrj     }
764*38fd1498Szrj 
765*38fd1498Szrj   FOR_EACH_LOOP (loop, 0)
766*38fd1498Szrj     {
767*38fd1498Szrj       doloop_optimize (loop);
768*38fd1498Szrj     }
769*38fd1498Szrj 
770*38fd1498Szrj   if (optimize == 1)
771*38fd1498Szrj     df_remove_problem (df_live);
772*38fd1498Szrj 
773*38fd1498Szrj   iv_analysis_done ();
774*38fd1498Szrj 
775*38fd1498Szrj   checking_verify_loop_structure ();
776*38fd1498Szrj }
777