1*38fd1498Szrj /* Perform doloop optimizations
2*38fd1498Szrj Copyright (C) 2004-2018 Free Software Foundation, Inc.
3*38fd1498Szrj Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
4*38fd1498Szrj
5*38fd1498Szrj This file is part of GCC.
6*38fd1498Szrj
7*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
8*38fd1498Szrj the terms of the GNU General Public License as published by the Free
9*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
10*38fd1498Szrj version.
11*38fd1498Szrj
12*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
14*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15*38fd1498Szrj for more details.
16*38fd1498Szrj
17*38fd1498Szrj You should have received a copy of the GNU General Public License
18*38fd1498Szrj along with GCC; see the file COPYING3. If not see
19*38fd1498Szrj <http://www.gnu.org/licenses/>. */
20*38fd1498Szrj
21*38fd1498Szrj #include "config.h"
22*38fd1498Szrj #include "system.h"
23*38fd1498Szrj #include "coretypes.h"
24*38fd1498Szrj #include "backend.h"
25*38fd1498Szrj #include "target.h"
26*38fd1498Szrj #include "rtl.h"
27*38fd1498Szrj #include "tree.h"
28*38fd1498Szrj #include "cfghooks.h"
29*38fd1498Szrj #include "memmodel.h"
30*38fd1498Szrj #include "emit-rtl.h"
31*38fd1498Szrj #include "dojump.h"
32*38fd1498Szrj #include "expr.h"
33*38fd1498Szrj #include "cfgloop.h"
34*38fd1498Szrj #include "cfgrtl.h"
35*38fd1498Szrj #include "params.h"
36*38fd1498Szrj #include "dumpfile.h"
37*38fd1498Szrj #include "loop-unroll.h"
38*38fd1498Szrj #include "regs.h"
39*38fd1498Szrj #include "df.h"
40*38fd1498Szrj
41*38fd1498Szrj /* This module is used to modify loops with a determinable number of
42*38fd1498Szrj iterations to use special low-overhead looping instructions.
43*38fd1498Szrj
44*38fd1498Szrj It first validates whether the loop is well behaved and has a
45*38fd1498Szrj determinable number of iterations (either at compile or run-time).
46*38fd1498Szrj It then modifies the loop to use a low-overhead looping pattern as
47*38fd1498Szrj follows:
48*38fd1498Szrj
49*38fd1498Szrj 1. A pseudo register is allocated as the loop iteration counter.
50*38fd1498Szrj
51*38fd1498Szrj 2. The number of loop iterations is calculated and is stored
52*38fd1498Szrj in the loop counter.
53*38fd1498Szrj
54*38fd1498Szrj 3. At the end of the loop, the jump insn is replaced by the
55*38fd1498Szrj doloop_end pattern. The compare must remain because it might be
56*38fd1498Szrj used elsewhere. If the loop-variable or condition register are
57*38fd1498Szrj used elsewhere, they will be eliminated by flow.
58*38fd1498Szrj
59*38fd1498Szrj 4. An optional doloop_begin pattern is inserted at the top of the
60*38fd1498Szrj loop.
61*38fd1498Szrj
62*38fd1498Szrj TODO The optimization should only performed when either the biv used for exit
63*38fd1498Szrj condition is unused at all except for the exit test, or if we do not have to
64*38fd1498Szrj change its value, since otherwise we have to add a new induction variable,
65*38fd1498Szrj which usually will not pay up (unless the cost of the doloop pattern is
66*38fd1498Szrj somehow extremely lower than the cost of compare & jump, or unless the bct
67*38fd1498Szrj register cannot be used for anything else but doloop -- ??? detect these
68*38fd1498Szrj cases). */
69*38fd1498Szrj
70*38fd1498Szrj /* Return the loop termination condition for PATTERN or zero
71*38fd1498Szrj if it is not a decrement and branch jump insn. */
72*38fd1498Szrj
73*38fd1498Szrj rtx
doloop_condition_get(rtx_insn * doloop_pat)74*38fd1498Szrj doloop_condition_get (rtx_insn *doloop_pat)
75*38fd1498Szrj {
76*38fd1498Szrj rtx cmp;
77*38fd1498Szrj rtx inc;
78*38fd1498Szrj rtx reg;
79*38fd1498Szrj rtx inc_src;
80*38fd1498Szrj rtx condition;
81*38fd1498Szrj rtx pattern;
82*38fd1498Szrj rtx cc_reg = NULL_RTX;
83*38fd1498Szrj rtx reg_orig = NULL_RTX;
84*38fd1498Szrj
85*38fd1498Szrj /* The canonical doloop pattern we expect has one of the following
86*38fd1498Szrj forms:
87*38fd1498Szrj
88*38fd1498Szrj 1) (parallel [(set (pc) (if_then_else (condition)
89*38fd1498Szrj (label_ref (label))
90*38fd1498Szrj (pc)))
91*38fd1498Szrj (set (reg) (plus (reg) (const_int -1)))
92*38fd1498Szrj (additional clobbers and uses)])
93*38fd1498Szrj
94*38fd1498Szrj The branch must be the first entry of the parallel (also required
95*38fd1498Szrj by jump.c), and the second entry of the parallel must be a set of
96*38fd1498Szrj the loop counter register. Some targets (IA-64) wrap the set of
97*38fd1498Szrj the loop counter in an if_then_else too.
98*38fd1498Szrj
99*38fd1498Szrj 2) (set (reg) (plus (reg) (const_int -1))
100*38fd1498Szrj (set (pc) (if_then_else (reg != 0)
101*38fd1498Szrj (label_ref (label))
102*38fd1498Szrj (pc))).
103*38fd1498Szrj
104*38fd1498Szrj Some targets (ARM) do the comparison before the branch, as in the
105*38fd1498Szrj following form:
106*38fd1498Szrj
107*38fd1498Szrj 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
108*38fd1498Szrj (set (reg) (plus (reg) (const_int -1)))])
109*38fd1498Szrj (set (pc) (if_then_else (cc == NE)
110*38fd1498Szrj (label_ref (label))
111*38fd1498Szrj (pc))) */
112*38fd1498Szrj
113*38fd1498Szrj pattern = PATTERN (doloop_pat);
114*38fd1498Szrj
115*38fd1498Szrj if (GET_CODE (pattern) != PARALLEL)
116*38fd1498Szrj {
117*38fd1498Szrj rtx cond;
118*38fd1498Szrj rtx_insn *prev_insn = prev_nondebug_insn (doloop_pat);
119*38fd1498Szrj rtx cmp_arg1, cmp_arg2;
120*38fd1498Szrj rtx cmp_orig;
121*38fd1498Szrj
122*38fd1498Szrj /* In case the pattern is not PARALLEL we expect two forms
123*38fd1498Szrj of doloop which are cases 2) and 3) above: in case 2) the
124*38fd1498Szrj decrement immediately precedes the branch, while in case 3)
125*38fd1498Szrj the compare and decrement instructions immediately precede
126*38fd1498Szrj the branch. */
127*38fd1498Szrj
128*38fd1498Szrj if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
129*38fd1498Szrj return 0;
130*38fd1498Szrj
131*38fd1498Szrj cmp = pattern;
132*38fd1498Szrj if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
133*38fd1498Szrj {
134*38fd1498Szrj /* The third case: the compare and decrement instructions
135*38fd1498Szrj immediately precede the branch. */
136*38fd1498Szrj cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
137*38fd1498Szrj if (GET_CODE (cmp_orig) != SET)
138*38fd1498Szrj return 0;
139*38fd1498Szrj if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
140*38fd1498Szrj return 0;
141*38fd1498Szrj cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
142*38fd1498Szrj cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
143*38fd1498Szrj if (cmp_arg2 != const0_rtx
144*38fd1498Szrj || GET_CODE (cmp_arg1) != PLUS)
145*38fd1498Szrj return 0;
146*38fd1498Szrj reg_orig = XEXP (cmp_arg1, 0);
147*38fd1498Szrj if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
148*38fd1498Szrj || !REG_P (reg_orig))
149*38fd1498Szrj return 0;
150*38fd1498Szrj cc_reg = SET_DEST (cmp_orig);
151*38fd1498Szrj
152*38fd1498Szrj inc = XVECEXP (PATTERN (prev_insn), 0, 1);
153*38fd1498Szrj }
154*38fd1498Szrj else
155*38fd1498Szrj inc = PATTERN (prev_insn);
156*38fd1498Szrj if (GET_CODE (cmp) == SET && GET_CODE (SET_SRC (cmp)) == IF_THEN_ELSE)
157*38fd1498Szrj {
158*38fd1498Szrj /* We expect the condition to be of the form (reg != 0) */
159*38fd1498Szrj cond = XEXP (SET_SRC (cmp), 0);
160*38fd1498Szrj if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
161*38fd1498Szrj return 0;
162*38fd1498Szrj }
163*38fd1498Szrj }
164*38fd1498Szrj else
165*38fd1498Szrj {
166*38fd1498Szrj cmp = XVECEXP (pattern, 0, 0);
167*38fd1498Szrj inc = XVECEXP (pattern, 0, 1);
168*38fd1498Szrj }
169*38fd1498Szrj
170*38fd1498Szrj /* Check for (set (reg) (something)). */
171*38fd1498Szrj if (GET_CODE (inc) != SET)
172*38fd1498Szrj return 0;
173*38fd1498Szrj reg = SET_DEST (inc);
174*38fd1498Szrj if (! REG_P (reg))
175*38fd1498Szrj return 0;
176*38fd1498Szrj
177*38fd1498Szrj /* Check if something = (plus (reg) (const_int -1)).
178*38fd1498Szrj On IA-64, this decrement is wrapped in an if_then_else. */
179*38fd1498Szrj inc_src = SET_SRC (inc);
180*38fd1498Szrj if (GET_CODE (inc_src) == IF_THEN_ELSE)
181*38fd1498Szrj inc_src = XEXP (inc_src, 1);
182*38fd1498Szrj if (GET_CODE (inc_src) != PLUS
183*38fd1498Szrj || XEXP (inc_src, 0) != reg
184*38fd1498Szrj || XEXP (inc_src, 1) != constm1_rtx)
185*38fd1498Szrj return 0;
186*38fd1498Szrj
187*38fd1498Szrj /* Check for (set (pc) (if_then_else (condition)
188*38fd1498Szrj (label_ref (label))
189*38fd1498Szrj (pc))). */
190*38fd1498Szrj if (GET_CODE (cmp) != SET
191*38fd1498Szrj || SET_DEST (cmp) != pc_rtx
192*38fd1498Szrj || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
193*38fd1498Szrj || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
194*38fd1498Szrj || XEXP (SET_SRC (cmp), 2) != pc_rtx)
195*38fd1498Szrj return 0;
196*38fd1498Szrj
197*38fd1498Szrj /* Extract loop termination condition. */
198*38fd1498Szrj condition = XEXP (SET_SRC (cmp), 0);
199*38fd1498Szrj
200*38fd1498Szrj /* We expect a GE or NE comparison with 0 or 1. */
201*38fd1498Szrj if ((GET_CODE (condition) != GE
202*38fd1498Szrj && GET_CODE (condition) != NE)
203*38fd1498Szrj || (XEXP (condition, 1) != const0_rtx
204*38fd1498Szrj && XEXP (condition, 1) != const1_rtx))
205*38fd1498Szrj return 0;
206*38fd1498Szrj
207*38fd1498Szrj if ((XEXP (condition, 0) == reg)
208*38fd1498Szrj /* For the third case: */
209*38fd1498Szrj || ((cc_reg != NULL_RTX)
210*38fd1498Szrj && (XEXP (condition, 0) == cc_reg)
211*38fd1498Szrj && (reg_orig == reg))
212*38fd1498Szrj || (GET_CODE (XEXP (condition, 0)) == PLUS
213*38fd1498Szrj && XEXP (XEXP (condition, 0), 0) == reg))
214*38fd1498Szrj {
215*38fd1498Szrj if (GET_CODE (pattern) != PARALLEL)
216*38fd1498Szrj /* For the second form we expect:
217*38fd1498Szrj
218*38fd1498Szrj (set (reg) (plus (reg) (const_int -1))
219*38fd1498Szrj (set (pc) (if_then_else (reg != 0)
220*38fd1498Szrj (label_ref (label))
221*38fd1498Szrj (pc))).
222*38fd1498Szrj
223*38fd1498Szrj is equivalent to the following:
224*38fd1498Szrj
225*38fd1498Szrj (parallel [(set (pc) (if_then_else (reg != 1)
226*38fd1498Szrj (label_ref (label))
227*38fd1498Szrj (pc)))
228*38fd1498Szrj (set (reg) (plus (reg) (const_int -1)))
229*38fd1498Szrj (additional clobbers and uses)])
230*38fd1498Szrj
231*38fd1498Szrj For the third form we expect:
232*38fd1498Szrj
233*38fd1498Szrj (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
234*38fd1498Szrj (set (reg) (plus (reg) (const_int -1)))])
235*38fd1498Szrj (set (pc) (if_then_else (cc == NE)
236*38fd1498Szrj (label_ref (label))
237*38fd1498Szrj (pc)))
238*38fd1498Szrj
239*38fd1498Szrj which is equivalent to the following:
240*38fd1498Szrj
241*38fd1498Szrj (parallel [(set (cc) (compare (reg, 1))
242*38fd1498Szrj (set (reg) (plus (reg) (const_int -1)))
243*38fd1498Szrj (set (pc) (if_then_else (NE == cc)
244*38fd1498Szrj (label_ref (label))
245*38fd1498Szrj (pc))))])
246*38fd1498Szrj
247*38fd1498Szrj So we return the second form instead for the two cases.
248*38fd1498Szrj
249*38fd1498Szrj */
250*38fd1498Szrj condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
251*38fd1498Szrj
252*38fd1498Szrj return condition;
253*38fd1498Szrj }
254*38fd1498Szrj
255*38fd1498Szrj /* ??? If a machine uses a funny comparison, we could return a
256*38fd1498Szrj canonicalized form here. */
257*38fd1498Szrj
258*38fd1498Szrj return 0;
259*38fd1498Szrj }
260*38fd1498Szrj
261*38fd1498Szrj /* Return nonzero if the loop specified by LOOP is suitable for
262*38fd1498Szrj the use of special low-overhead looping instructions. DESC
263*38fd1498Szrj describes the number of iterations of the loop. */
264*38fd1498Szrj
265*38fd1498Szrj static bool
doloop_valid_p(struct loop * loop,struct niter_desc * desc)266*38fd1498Szrj doloop_valid_p (struct loop *loop, struct niter_desc *desc)
267*38fd1498Szrj {
268*38fd1498Szrj basic_block *body = get_loop_body (loop), bb;
269*38fd1498Szrj rtx_insn *insn;
270*38fd1498Szrj unsigned i;
271*38fd1498Szrj bool result = true;
272*38fd1498Szrj
273*38fd1498Szrj /* Check for loops that may not terminate under special conditions. */
274*38fd1498Szrj if (!desc->simple_p
275*38fd1498Szrj || desc->assumptions
276*38fd1498Szrj || desc->infinite)
277*38fd1498Szrj {
278*38fd1498Szrj /* There are some cases that would require a special attention.
279*38fd1498Szrj For example if the comparison is LEU and the comparison value
280*38fd1498Szrj is UINT_MAX then the loop will not terminate. Similarly, if the
281*38fd1498Szrj comparison code is GEU and the comparison value is 0, the
282*38fd1498Szrj loop will not terminate.
283*38fd1498Szrj
284*38fd1498Szrj If the absolute increment is not 1, the loop can be infinite
285*38fd1498Szrj even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
286*38fd1498Szrj
287*38fd1498Szrj ??? We could compute these conditions at run-time and have a
288*38fd1498Szrj additional jump around the loop to ensure an infinite loop.
289*38fd1498Szrj However, it is very unlikely that this is the intended
290*38fd1498Szrj behavior of the loop and checking for these rare boundary
291*38fd1498Szrj conditions would pessimize all other code.
292*38fd1498Szrj
293*38fd1498Szrj If the loop is executed only a few times an extra check to
294*38fd1498Szrj restart the loop could use up most of the benefits of using a
295*38fd1498Szrj count register loop. Note however, that normally, this
296*38fd1498Szrj restart branch would never execute, so it could be predicted
297*38fd1498Szrj well by the CPU. We should generate the pessimistic code by
298*38fd1498Szrj default, and have an option, e.g. -funsafe-loops that would
299*38fd1498Szrj enable count-register loops in this case. */
300*38fd1498Szrj if (dump_file)
301*38fd1498Szrj fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
302*38fd1498Szrj result = false;
303*38fd1498Szrj goto cleanup;
304*38fd1498Szrj }
305*38fd1498Szrj
306*38fd1498Szrj for (i = 0; i < loop->num_nodes; i++)
307*38fd1498Szrj {
308*38fd1498Szrj bb = body[i];
309*38fd1498Szrj
310*38fd1498Szrj for (insn = BB_HEAD (bb);
311*38fd1498Szrj insn != NEXT_INSN (BB_END (bb));
312*38fd1498Szrj insn = NEXT_INSN (insn))
313*38fd1498Szrj {
314*38fd1498Szrj /* Different targets have different necessities for low-overhead
315*38fd1498Szrj looping. Call the back end for each instruction within the loop
316*38fd1498Szrj to let it decide whether the insn prohibits a low-overhead loop.
317*38fd1498Szrj It will then return the cause for it to emit to the dump file. */
318*38fd1498Szrj const char * invalid = targetm.invalid_within_doloop (insn);
319*38fd1498Szrj if (invalid)
320*38fd1498Szrj {
321*38fd1498Szrj if (dump_file)
322*38fd1498Szrj fprintf (dump_file, "Doloop: %s\n", invalid);
323*38fd1498Szrj result = false;
324*38fd1498Szrj goto cleanup;
325*38fd1498Szrj }
326*38fd1498Szrj }
327*38fd1498Szrj }
328*38fd1498Szrj result = true;
329*38fd1498Szrj
330*38fd1498Szrj cleanup:
331*38fd1498Szrj free (body);
332*38fd1498Szrj
333*38fd1498Szrj return result;
334*38fd1498Szrj }
335*38fd1498Szrj
336*38fd1498Szrj /* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
337*38fd1498Szrj edge. If the condition is always false, do not do anything. If it is always
338*38fd1498Szrj true, redirect E to DEST and return false. In all other cases, true is
339*38fd1498Szrj returned. */
340*38fd1498Szrj
341*38fd1498Szrj static bool
add_test(rtx cond,edge * e,basic_block dest)342*38fd1498Szrj add_test (rtx cond, edge *e, basic_block dest)
343*38fd1498Szrj {
344*38fd1498Szrj rtx_insn *seq, *jump;
345*38fd1498Szrj rtx_code_label *label;
346*38fd1498Szrj machine_mode mode;
347*38fd1498Szrj rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
348*38fd1498Szrj enum rtx_code code = GET_CODE (cond);
349*38fd1498Szrj basic_block bb;
350*38fd1498Szrj /* The jump is supposed to handle an unlikely special case. */
351*38fd1498Szrj profile_probability prob = profile_probability::guessed_never ();
352*38fd1498Szrj
353*38fd1498Szrj mode = GET_MODE (XEXP (cond, 0));
354*38fd1498Szrj if (mode == VOIDmode)
355*38fd1498Szrj mode = GET_MODE (XEXP (cond, 1));
356*38fd1498Szrj
357*38fd1498Szrj start_sequence ();
358*38fd1498Szrj op0 = force_operand (op0, NULL_RTX);
359*38fd1498Szrj op1 = force_operand (op1, NULL_RTX);
360*38fd1498Szrj label = block_label (dest);
361*38fd1498Szrj do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, NULL, label,
362*38fd1498Szrj prob);
363*38fd1498Szrj
364*38fd1498Szrj jump = get_last_insn ();
365*38fd1498Szrj if (!jump || !JUMP_P (jump))
366*38fd1498Szrj {
367*38fd1498Szrj /* The condition is always false and the jump was optimized out. */
368*38fd1498Szrj end_sequence ();
369*38fd1498Szrj return true;
370*38fd1498Szrj }
371*38fd1498Szrj
372*38fd1498Szrj seq = get_insns ();
373*38fd1498Szrj unshare_all_rtl_in_chain (seq);
374*38fd1498Szrj end_sequence ();
375*38fd1498Szrj
376*38fd1498Szrj /* There always is at least the jump insn in the sequence. */
377*38fd1498Szrj gcc_assert (seq != NULL_RTX);
378*38fd1498Szrj
379*38fd1498Szrj bb = split_edge_and_insert (*e, seq);
380*38fd1498Szrj *e = single_succ_edge (bb);
381*38fd1498Szrj
382*38fd1498Szrj if (any_uncondjump_p (jump))
383*38fd1498Szrj {
384*38fd1498Szrj /* The condition is always true. */
385*38fd1498Szrj delete_insn (jump);
386*38fd1498Szrj redirect_edge_and_branch_force (*e, dest);
387*38fd1498Szrj return false;
388*38fd1498Szrj }
389*38fd1498Szrj
390*38fd1498Szrj JUMP_LABEL (jump) = label;
391*38fd1498Szrj
392*38fd1498Szrj LABEL_NUSES (label)++;
393*38fd1498Szrj
394*38fd1498Szrj edge e2 = make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
395*38fd1498Szrj e2->probability = prob;
396*38fd1498Szrj (*e)->probability = prob.invert ();
397*38fd1498Szrj update_br_prob_note (e2->src);
398*38fd1498Szrj return true;
399*38fd1498Szrj }
400*38fd1498Szrj
401*38fd1498Szrj /* Modify the loop to use the low-overhead looping insn where LOOP
402*38fd1498Szrj describes the loop, DESC describes the number of iterations of the
403*38fd1498Szrj loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
404*38fd1498Szrj end of the loop. CONDITION is the condition separated from the
405*38fd1498Szrj DOLOOP_SEQ. COUNT is the number of iterations of the LOOP. */
406*38fd1498Szrj
407*38fd1498Szrj static void
doloop_modify(struct loop * loop,struct niter_desc * desc,rtx_insn * doloop_seq,rtx condition,rtx count)408*38fd1498Szrj doloop_modify (struct loop *loop, struct niter_desc *desc,
409*38fd1498Szrj rtx_insn *doloop_seq, rtx condition, rtx count)
410*38fd1498Szrj {
411*38fd1498Szrj rtx counter_reg;
412*38fd1498Szrj rtx tmp, noloop = NULL_RTX;
413*38fd1498Szrj rtx_insn *sequence;
414*38fd1498Szrj rtx_insn *jump_insn;
415*38fd1498Szrj rtx_code_label *jump_label;
416*38fd1498Szrj int nonneg = 0;
417*38fd1498Szrj bool increment_count;
418*38fd1498Szrj basic_block loop_end = desc->out_edge->src;
419*38fd1498Szrj scalar_int_mode mode;
420*38fd1498Szrj widest_int iterations;
421*38fd1498Szrj
422*38fd1498Szrj jump_insn = BB_END (loop_end);
423*38fd1498Szrj
424*38fd1498Szrj if (dump_file)
425*38fd1498Szrj {
426*38fd1498Szrj fprintf (dump_file, "Doloop: Inserting doloop pattern (");
427*38fd1498Szrj if (desc->const_iter)
428*38fd1498Szrj fprintf (dump_file, "%" PRId64, desc->niter);
429*38fd1498Szrj else
430*38fd1498Szrj fputs ("runtime", dump_file);
431*38fd1498Szrj fputs (" iterations).\n", dump_file);
432*38fd1498Szrj }
433*38fd1498Szrj
434*38fd1498Szrj /* Discard original jump to continue loop. The original compare
435*38fd1498Szrj result may still be live, so it cannot be discarded explicitly. */
436*38fd1498Szrj delete_insn (jump_insn);
437*38fd1498Szrj
438*38fd1498Szrj counter_reg = XEXP (condition, 0);
439*38fd1498Szrj if (GET_CODE (counter_reg) == PLUS)
440*38fd1498Szrj counter_reg = XEXP (counter_reg, 0);
441*38fd1498Szrj /* These patterns must operate on integer counters. */
442*38fd1498Szrj mode = as_a <scalar_int_mode> (GET_MODE (counter_reg));
443*38fd1498Szrj
444*38fd1498Szrj increment_count = false;
445*38fd1498Szrj switch (GET_CODE (condition))
446*38fd1498Szrj {
447*38fd1498Szrj case NE:
448*38fd1498Szrj /* Currently only NE tests against zero and one are supported. */
449*38fd1498Szrj noloop = XEXP (condition, 1);
450*38fd1498Szrj if (noloop != const0_rtx)
451*38fd1498Szrj {
452*38fd1498Szrj gcc_assert (noloop == const1_rtx);
453*38fd1498Szrj increment_count = true;
454*38fd1498Szrj }
455*38fd1498Szrj break;
456*38fd1498Szrj
457*38fd1498Szrj case GE:
458*38fd1498Szrj /* Currently only GE tests against zero are supported. */
459*38fd1498Szrj gcc_assert (XEXP (condition, 1) == const0_rtx);
460*38fd1498Szrj
461*38fd1498Szrj noloop = constm1_rtx;
462*38fd1498Szrj
463*38fd1498Szrj /* The iteration count does not need incrementing for a GE test. */
464*38fd1498Szrj increment_count = false;
465*38fd1498Szrj
466*38fd1498Szrj /* Determine if the iteration counter will be non-negative.
467*38fd1498Szrj Note that the maximum value loaded is iterations_max - 1. */
468*38fd1498Szrj if (get_max_loop_iterations (loop, &iterations)
469*38fd1498Szrj && wi::leu_p (iterations,
470*38fd1498Szrj wi::set_bit_in_zero <widest_int>
471*38fd1498Szrj (GET_MODE_PRECISION (mode) - 1)))
472*38fd1498Szrj nonneg = 1;
473*38fd1498Szrj break;
474*38fd1498Szrj
475*38fd1498Szrj /* Abort if an invalid doloop pattern has been generated. */
476*38fd1498Szrj default:
477*38fd1498Szrj gcc_unreachable ();
478*38fd1498Szrj }
479*38fd1498Szrj
480*38fd1498Szrj if (increment_count)
481*38fd1498Szrj count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
482*38fd1498Szrj
483*38fd1498Szrj /* Insert initialization of the count register into the loop header. */
484*38fd1498Szrj start_sequence ();
485*38fd1498Szrj /* count has been already copied through copy_rtx. */
486*38fd1498Szrj reset_used_flags (count);
487*38fd1498Szrj set_used_flags (condition);
488*38fd1498Szrj tmp = force_operand (count, counter_reg);
489*38fd1498Szrj convert_move (counter_reg, tmp, 1);
490*38fd1498Szrj sequence = get_insns ();
491*38fd1498Szrj unshare_all_rtl_in_chain (sequence);
492*38fd1498Szrj end_sequence ();
493*38fd1498Szrj emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
494*38fd1498Szrj
495*38fd1498Szrj if (desc->noloop_assumptions)
496*38fd1498Szrj {
497*38fd1498Szrj rtx ass = copy_rtx (desc->noloop_assumptions);
498*38fd1498Szrj basic_block preheader = loop_preheader_edge (loop)->src;
499*38fd1498Szrj basic_block set_zero = split_edge (loop_preheader_edge (loop));
500*38fd1498Szrj basic_block new_preheader = split_edge (loop_preheader_edge (loop));
501*38fd1498Szrj edge te;
502*38fd1498Szrj
503*38fd1498Szrj /* Expand the condition testing the assumptions and if it does not pass,
504*38fd1498Szrj reset the count register to 0. */
505*38fd1498Szrj redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
506*38fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
507*38fd1498Szrj
508*38fd1498Szrj set_zero->count = profile_count::uninitialized ();
509*38fd1498Szrj
510*38fd1498Szrj te = single_succ_edge (preheader);
511*38fd1498Szrj for (; ass; ass = XEXP (ass, 1))
512*38fd1498Szrj if (!add_test (XEXP (ass, 0), &te, set_zero))
513*38fd1498Szrj break;
514*38fd1498Szrj
515*38fd1498Szrj if (ass)
516*38fd1498Szrj {
517*38fd1498Szrj /* We reached a condition that is always true. This is very hard to
518*38fd1498Szrj reproduce (such a loop does not roll, and thus it would most
519*38fd1498Szrj likely get optimized out by some of the preceding optimizations).
520*38fd1498Szrj In fact, I do not have any testcase for it. However, it would
521*38fd1498Szrj also be very hard to show that it is impossible, so we must
522*38fd1498Szrj handle this case. */
523*38fd1498Szrj set_zero->count = preheader->count;
524*38fd1498Szrj }
525*38fd1498Szrj
526*38fd1498Szrj if (EDGE_COUNT (set_zero->preds) == 0)
527*38fd1498Szrj {
528*38fd1498Szrj /* All the conditions were simplified to false, remove the
529*38fd1498Szrj unreachable set_zero block. */
530*38fd1498Szrj delete_basic_block (set_zero);
531*38fd1498Szrj }
532*38fd1498Szrj else
533*38fd1498Szrj {
534*38fd1498Szrj /* Reset the counter to zero in the set_zero block. */
535*38fd1498Szrj start_sequence ();
536*38fd1498Szrj convert_move (counter_reg, noloop, 0);
537*38fd1498Szrj sequence = get_insns ();
538*38fd1498Szrj end_sequence ();
539*38fd1498Szrj emit_insn_after (sequence, BB_END (set_zero));
540*38fd1498Szrj
541*38fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, set_zero,
542*38fd1498Szrj recompute_dominator (CDI_DOMINATORS,
543*38fd1498Szrj set_zero));
544*38fd1498Szrj }
545*38fd1498Szrj
546*38fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, new_preheader,
547*38fd1498Szrj recompute_dominator (CDI_DOMINATORS,
548*38fd1498Szrj new_preheader));
549*38fd1498Szrj }
550*38fd1498Szrj
551*38fd1498Szrj /* Some targets (eg, C4x) need to initialize special looping
552*38fd1498Szrj registers. */
553*38fd1498Szrj if (targetm.have_doloop_begin ())
554*38fd1498Szrj if (rtx_insn *seq = targetm.gen_doloop_begin (counter_reg, doloop_seq))
555*38fd1498Szrj emit_insn_after (seq, BB_END (loop_preheader_edge (loop)->src));
556*38fd1498Szrj
557*38fd1498Szrj /* Insert the new low-overhead looping insn. */
558*38fd1498Szrj emit_jump_insn_after (doloop_seq, BB_END (loop_end));
559*38fd1498Szrj jump_insn = BB_END (loop_end);
560*38fd1498Szrj jump_label = block_label (desc->in_edge->dest);
561*38fd1498Szrj JUMP_LABEL (jump_insn) = jump_label;
562*38fd1498Szrj LABEL_NUSES (jump_label)++;
563*38fd1498Szrj
564*38fd1498Szrj /* Ensure the right fallthru edge is marked, for case we have reversed
565*38fd1498Szrj the condition. */
566*38fd1498Szrj desc->in_edge->flags &= ~EDGE_FALLTHRU;
567*38fd1498Szrj desc->out_edge->flags |= EDGE_FALLTHRU;
568*38fd1498Szrj
569*38fd1498Szrj /* Add a REG_NONNEG note if the actual or estimated maximum number
570*38fd1498Szrj of iterations is non-negative. */
571*38fd1498Szrj if (nonneg)
572*38fd1498Szrj add_reg_note (jump_insn, REG_NONNEG, NULL_RTX);
573*38fd1498Szrj
574*38fd1498Szrj /* Update the REG_BR_PROB note. */
575*38fd1498Szrj if (desc->in_edge->probability.initialized_p ())
576*38fd1498Szrj add_reg_br_prob_note (jump_insn, desc->in_edge->probability);
577*38fd1498Szrj }
578*38fd1498Szrj
579*38fd1498Szrj /* Called through note_stores. */
580*38fd1498Szrj
581*38fd1498Szrj static void
record_reg_sets(rtx x,const_rtx pat ATTRIBUTE_UNUSED,void * data)582*38fd1498Szrj record_reg_sets (rtx x, const_rtx pat ATTRIBUTE_UNUSED, void *data)
583*38fd1498Szrj {
584*38fd1498Szrj bitmap mod = (bitmap)data;
585*38fd1498Szrj if (REG_P (x))
586*38fd1498Szrj {
587*38fd1498Szrj unsigned int regno = REGNO (x);
588*38fd1498Szrj if (HARD_REGISTER_P (x))
589*38fd1498Szrj {
590*38fd1498Szrj unsigned int end_regno = end_hard_regno (GET_MODE (x), regno);
591*38fd1498Szrj do
592*38fd1498Szrj bitmap_set_bit (mod, regno);
593*38fd1498Szrj while (++regno < end_regno);
594*38fd1498Szrj }
595*38fd1498Szrj else
596*38fd1498Szrj bitmap_set_bit (mod, regno);
597*38fd1498Szrj }
598*38fd1498Szrj }
599*38fd1498Szrj
600*38fd1498Szrj /* Process loop described by LOOP validating that the loop is suitable for
601*38fd1498Szrj conversion to use a low overhead looping instruction, replacing the jump
602*38fd1498Szrj insn where suitable. Returns true if the loop was successfully
603*38fd1498Szrj modified. */
604*38fd1498Szrj
605*38fd1498Szrj static bool
doloop_optimize(struct loop * loop)606*38fd1498Szrj doloop_optimize (struct loop *loop)
607*38fd1498Szrj {
608*38fd1498Szrj scalar_int_mode mode;
609*38fd1498Szrj rtx doloop_reg;
610*38fd1498Szrj rtx count;
611*38fd1498Szrj widest_int iterations, iterations_max;
612*38fd1498Szrj rtx_code_label *start_label;
613*38fd1498Szrj rtx condition;
614*38fd1498Szrj unsigned level;
615*38fd1498Szrj HOST_WIDE_INT est_niter;
616*38fd1498Szrj int max_cost;
617*38fd1498Szrj struct niter_desc *desc;
618*38fd1498Szrj unsigned word_mode_size;
619*38fd1498Szrj unsigned HOST_WIDE_INT word_mode_max;
620*38fd1498Szrj int entered_at_top;
621*38fd1498Szrj
622*38fd1498Szrj if (dump_file)
623*38fd1498Szrj fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
624*38fd1498Szrj
625*38fd1498Szrj iv_analysis_loop_init (loop);
626*38fd1498Szrj
627*38fd1498Szrj /* Find the simple exit of a LOOP. */
628*38fd1498Szrj desc = get_simple_loop_desc (loop);
629*38fd1498Szrj
630*38fd1498Szrj /* Check that loop is a candidate for a low-overhead looping insn. */
631*38fd1498Szrj if (!doloop_valid_p (loop, desc))
632*38fd1498Szrj {
633*38fd1498Szrj if (dump_file)
634*38fd1498Szrj fprintf (dump_file,
635*38fd1498Szrj "Doloop: The loop is not suitable.\n");
636*38fd1498Szrj return false;
637*38fd1498Szrj }
638*38fd1498Szrj mode = desc->mode;
639*38fd1498Szrj
640*38fd1498Szrj est_niter = get_estimated_loop_iterations_int (loop);
641*38fd1498Szrj if (est_niter == -1)
642*38fd1498Szrj est_niter = get_likely_max_loop_iterations_int (loop);
643*38fd1498Szrj
644*38fd1498Szrj if (est_niter >= 0 && est_niter < 3)
645*38fd1498Szrj {
646*38fd1498Szrj if (dump_file)
647*38fd1498Szrj fprintf (dump_file,
648*38fd1498Szrj "Doloop: Too few iterations (%u) to be profitable.\n",
649*38fd1498Szrj (unsigned int)est_niter);
650*38fd1498Szrj return false;
651*38fd1498Szrj }
652*38fd1498Szrj
653*38fd1498Szrj max_cost
654*38fd1498Szrj = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
655*38fd1498Szrj if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
656*38fd1498Szrj > max_cost)
657*38fd1498Szrj {
658*38fd1498Szrj if (dump_file)
659*38fd1498Szrj fprintf (dump_file,
660*38fd1498Szrj "Doloop: number of iterations too costly to compute.\n");
661*38fd1498Szrj return false;
662*38fd1498Szrj }
663*38fd1498Szrj
664*38fd1498Szrj if (desc->const_iter)
665*38fd1498Szrj iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
666*38fd1498Szrj UNSIGNED);
667*38fd1498Szrj else
668*38fd1498Szrj iterations = 0;
669*38fd1498Szrj if (!get_max_loop_iterations (loop, &iterations_max))
670*38fd1498Szrj iterations_max = 0;
671*38fd1498Szrj level = get_loop_level (loop) + 1;
672*38fd1498Szrj entered_at_top = (loop->latch == desc->in_edge->dest
673*38fd1498Szrj && contains_no_active_insn_p (loop->latch));
674*38fd1498Szrj if (!targetm.can_use_doloop_p (iterations, iterations_max, level,
675*38fd1498Szrj entered_at_top))
676*38fd1498Szrj {
677*38fd1498Szrj if (dump_file)
678*38fd1498Szrj fprintf (dump_file, "Loop rejected by can_use_doloop_p.\n");
679*38fd1498Szrj return false;
680*38fd1498Szrj }
681*38fd1498Szrj
682*38fd1498Szrj /* Generate looping insn. If the pattern FAILs then give up trying
683*38fd1498Szrj to modify the loop since there is some aspect the back-end does
684*38fd1498Szrj not like. */
685*38fd1498Szrj count = copy_rtx (desc->niter_expr);
686*38fd1498Szrj start_label = block_label (desc->in_edge->dest);
687*38fd1498Szrj doloop_reg = gen_reg_rtx (mode);
688*38fd1498Szrj rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
689*38fd1498Szrj
690*38fd1498Szrj word_mode_size = GET_MODE_PRECISION (word_mode);
691*38fd1498Szrj word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
692*38fd1498Szrj if (! doloop_seq
693*38fd1498Szrj && mode != word_mode
694*38fd1498Szrj /* Before trying mode different from the one in that # of iterations is
695*38fd1498Szrj computed, we must be sure that the number of iterations fits into
696*38fd1498Szrj the new mode. */
697*38fd1498Szrj && (word_mode_size >= GET_MODE_PRECISION (mode)
698*38fd1498Szrj || wi::leu_p (iterations_max, word_mode_max)))
699*38fd1498Szrj {
700*38fd1498Szrj if (word_mode_size > GET_MODE_PRECISION (mode))
701*38fd1498Szrj count = simplify_gen_unary (ZERO_EXTEND, word_mode, count, mode);
702*38fd1498Szrj else
703*38fd1498Szrj count = lowpart_subreg (word_mode, count, mode);
704*38fd1498Szrj PUT_MODE (doloop_reg, word_mode);
705*38fd1498Szrj doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
706*38fd1498Szrj }
707*38fd1498Szrj if (! doloop_seq)
708*38fd1498Szrj {
709*38fd1498Szrj if (dump_file)
710*38fd1498Szrj fprintf (dump_file,
711*38fd1498Szrj "Doloop: Target unwilling to use doloop pattern!\n");
712*38fd1498Szrj return false;
713*38fd1498Szrj }
714*38fd1498Szrj
715*38fd1498Szrj /* If multiple instructions were created, the last must be the
716*38fd1498Szrj jump instruction. */
717*38fd1498Szrj rtx_insn *doloop_insn = doloop_seq;
718*38fd1498Szrj while (NEXT_INSN (doloop_insn) != NULL_RTX)
719*38fd1498Szrj doloop_insn = NEXT_INSN (doloop_insn);
720*38fd1498Szrj if (!JUMP_P (doloop_insn)
721*38fd1498Szrj || !(condition = doloop_condition_get (doloop_insn)))
722*38fd1498Szrj {
723*38fd1498Szrj if (dump_file)
724*38fd1498Szrj fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
725*38fd1498Szrj return false;
726*38fd1498Szrj }
727*38fd1498Szrj
728*38fd1498Szrj /* Ensure that the new sequence doesn't clobber a register that
729*38fd1498Szrj is live at the end of the block. */
730*38fd1498Szrj {
731*38fd1498Szrj bitmap modified = BITMAP_ALLOC (NULL);
732*38fd1498Szrj
733*38fd1498Szrj for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i))
734*38fd1498Szrj note_stores (PATTERN (i), record_reg_sets, modified);
735*38fd1498Szrj
736*38fd1498Szrj basic_block loop_end = desc->out_edge->src;
737*38fd1498Szrj bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified);
738*38fd1498Szrj BITMAP_FREE (modified);
739*38fd1498Szrj
740*38fd1498Szrj if (fail)
741*38fd1498Szrj {
742*38fd1498Szrj if (dump_file)
743*38fd1498Szrj fprintf (dump_file, "Doloop: doloop pattern clobbers live out\n");
744*38fd1498Szrj return false;
745*38fd1498Szrj }
746*38fd1498Szrj }
747*38fd1498Szrj
748*38fd1498Szrj doloop_modify (loop, desc, doloop_seq, condition, count);
749*38fd1498Szrj return true;
750*38fd1498Szrj }
751*38fd1498Szrj
752*38fd1498Szrj /* This is the main entry point. Process all loops using doloop_optimize. */
753*38fd1498Szrj
754*38fd1498Szrj void
doloop_optimize_loops(void)755*38fd1498Szrj doloop_optimize_loops (void)
756*38fd1498Szrj {
757*38fd1498Szrj struct loop *loop;
758*38fd1498Szrj
759*38fd1498Szrj if (optimize == 1)
760*38fd1498Szrj {
761*38fd1498Szrj df_live_add_problem ();
762*38fd1498Szrj df_live_set_all_dirty ();
763*38fd1498Szrj }
764*38fd1498Szrj
765*38fd1498Szrj FOR_EACH_LOOP (loop, 0)
766*38fd1498Szrj {
767*38fd1498Szrj doloop_optimize (loop);
768*38fd1498Szrj }
769*38fd1498Szrj
770*38fd1498Szrj if (optimize == 1)
771*38fd1498Szrj df_remove_problem (df_live);
772*38fd1498Szrj
773*38fd1498Szrj iv_analysis_done ();
774*38fd1498Szrj
775*38fd1498Szrj checking_verify_loop_structure ();
776*38fd1498Szrj }
777