xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/loop-unroll.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Loop unrolling and peeling.
2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "hard-reg-set.h"
26 #include "obstack.h"
27 #include "basic-block.h"
28 #include "cfgloop.h"
29 #include "params.h"
30 #include "expr.h"
31 #include "hashtab.h"
32 #include "recog.h"
33 #include "target.h"
34 #include "dumpfile.h"
35 
36 /* This pass performs loop unrolling and peeling.  We only perform these
37    optimizations on innermost loops (with a single exception) because
38    the impact on performance is greatest there, and we want to avoid
39    unnecessary code size growth.  The gain comes from greater sequentiality
40    of code, better opportunities for further optimization passes and, in
41    some cases, fewer tests of exit conditions.  The main problem is code
42    growth, which impacts performance negatively due to cache effects.
43 
44    What we do:
45 
46    -- complete peeling of once-rolling loops; this is the above-mentioned
47       exception, as it causes the loop to be cancelled completely and
48       does not cause code growth
49    -- complete peeling of loops that roll a (small) constant number of times.
50    -- simple peeling of the first iterations of loops that do not roll much
51       (according to profile feedback)
52    -- unrolling of loops that roll a constant number of times; this is
53       almost always a win, as we get rid of exit condition tests.
54    -- unrolling of loops that roll a number of times that we can compute
55       at runtime; we also get rid of exit condition tests here, but there
56       is the extra expense of calculating the number of iterations
57    -- simple unrolling of remaining loops; this is performed only if we
58       are asked to, as the gain is questionable in this case and often
59       it may even slow down the code
60    For more detailed descriptions of each of those, see the comments at
61    the appropriate functions below.
62 
63    There are a number of parameters (defined and described in params.def)
64    that control how much we unroll/peel.
65 
66    ??? A great problem is that we don't have a good way to determine
67    how many times we should unroll the loop; the experiments I have
68    made showed that this choice may affect performance by several %.
69    */
70 
71 /* Information about induction variables to split.  */
72 
73 struct iv_to_split
74 {
75   rtx insn;		/* The insn in which the induction variable occurs.  */
76   rtx orig_var;		/* The variable (register) for the IV before split.  */
77   rtx base_var;		/* The variable on which the values in the further
78 			   iterations are based.  */
79   rtx step;		/* Step of the induction variable.  */
80   struct iv_to_split *next; /* Next entry in walking order.  */
81   unsigned n_loc;
82   unsigned loc[3];	/* Location where the definition of the induction
83 			   variable occurs in the insn.  For example if
84 			   N_LOC is 2, the expression is located at
85 			   XEXP (XEXP (single_set, loc[0]), loc[1]).  */
86 };
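
/* An illustrative sketch of the splitting itself (an addition, not part
   of the original sources): when the loop body is copied, the occurrence
   of the induction variable in copy number K is rewritten to be based on
   BASE_VAR, so an insn like

     i = i + 1;   ... use of i ...

   in copy K computes its value as BASE_VAR + K * STEP instead, breaking
   the chain of dependences on the single register I across the copies.  */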
87 
88 /* Information about accumulators to expand.  */
89 
90 struct var_to_expand
91 {
92   rtx insn;		           /* The insn in which the variable expansion occurs.  */
93   rtx reg;                         /* The accumulator which is expanded.  */
94   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
95   struct var_to_expand *next;	   /* Next entry in walking order.  */
96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
97                                       or multiplication.  */
98   int expansion_count;             /* Count the number of expansions generated so far.  */
99   int reuse_expansion;             /* The expansion we intend to reuse to expand
100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
101                                       the original accumulator.  Else use
102                                       var_expansions[REUSE_EXPANSION - 1].  */
103 };
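
/* A hedged end-to-end illustration (an addition; the comment before
   analyze_insn_to_expand_var below shows the loop body itself): an
   accumulator SUM expanded into copies SUM1 and SUM2 has the copies
   initialized before the loop (to zero for addition and subtraction,
   to one for multiplication) and combined at the loop exit, roughly

     sum = sum + sum1 + sum2;

   so that each unrolled copy can accumulate into its own register.  */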
104 
105 /* Information about optimization applied in
106    the unrolled loop.  */
107 
108 struct opt_info
109 {
110   htab_t insns_to_split;           /* A hashtable of insns to split.  */
111   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
112   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
113   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
114                                       to expand.  */
115   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
116   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
117   unsigned first_new_block;        /* The first basic block that was
118                                       duplicated.  */
119   basic_block loop_exit;           /* The loop exit basic block.  */
120   basic_block loop_preheader;      /* The loop preheader basic block.  */
121 };
122 
123 static void decide_unrolling_and_peeling (int);
124 static void peel_loops_completely (int);
125 static void decide_peel_simple (struct loop *, int);
126 static void decide_peel_once_rolling (struct loop *, int);
127 static void decide_peel_completely (struct loop *, int);
128 static void decide_unroll_stupid (struct loop *, int);
129 static void decide_unroll_constant_iterations (struct loop *, int);
130 static void decide_unroll_runtime_iterations (struct loop *, int);
131 static void peel_loop_simple (struct loop *);
132 static void peel_loop_completely (struct loop *);
133 static void unroll_loop_stupid (struct loop *);
134 static void unroll_loop_constant_iterations (struct loop *);
135 static void unroll_loop_runtime_iterations (struct loop *);
136 static struct opt_info *analyze_insns_in_loop (struct loop *);
137 static void opt_info_start_duplication (struct opt_info *);
138 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
139 static void free_opt_info (struct opt_info *);
140 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
141 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
142 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
143 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
144 static void insert_var_expansion_initialization (struct var_to_expand *,
145 						 basic_block);
146 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
147 					     basic_block);
148 static rtx get_expansion (struct var_to_expand *);
149 
150 /* Emit a message summarizing the unroll or peel that will be
151    performed for LOOP, along with the loop's location LOCUS, if
152    appropriate given the dump or -fopt-info settings.  */
153 
154 static void
155 report_unroll_peel (struct loop *loop, location_t locus)
156 {
157   struct niter_desc *desc;
158   int niters = 0;
159   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
160 
161   if (!dump_enabled_p ())
162     return;
163 
164   /* In the special case where the loop never iterated, emit
165      a different message so that we don't report an unroll by 0.
166      This matches the equivalent message emitted during tree unrolling.  */
167   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
168       && !loop->lpt_decision.times)
169     {
170       dump_printf_loc (report_flags, locus,
171                        "Turned loop into non-loop; it never loops.\n");
172       return;
173     }
174 
175   desc = get_simple_loop_desc (loop);
176 
177   if (desc->const_iter)
178     niters = desc->niter;
179   else if (loop->header->count)
180     niters = expected_loop_iterations (loop);
181 
182   dump_printf_loc (report_flags, locus,
183                    "%s loop %d times",
184                    (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
185                     ?  "Completely unroll"
186                     : (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
187                        ? "Peel" : "Unroll")),
188                    loop->lpt_decision.times);
189   if (profile_info)
190     dump_printf (report_flags,
191                  " (header execution count %d",
192                  (int)loop->header->count);
193   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
194     dump_printf (report_flags,
195                  "%s%s iterations %d)",
196                  profile_info ? ", " : " (",
197                  desc->const_iter ? "const" : "average",
198                  niters);
199   else if (profile_info)
200     dump_printf (report_flags, ")");
201 
202   dump_printf (report_flags, "\n");
203 }
204 
205 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
206 void
207 unroll_and_peel_loops (int flags)
208 {
209   struct loop *loop;
210   bool changed = false;
211   loop_iterator li;
212 
213   /* First perform complete loop peeling (it is almost surely a win,
214      and it strongly affects the parameters for further decisions).  */
215   peel_loops_completely (flags);
216 
217   /* Now decide the rest of the unrolling and peeling.  */
218   decide_unrolling_and_peeling (flags);
219 
220   /* Scan the loops, inner ones first.  */
221   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
222     {
223       /* And perform the appropriate transformations.  */
224       switch (loop->lpt_decision.decision)
225 	{
226 	case LPT_PEEL_COMPLETELY:
227 	  /* Already done.  */
228 	  gcc_unreachable ();
229 	case LPT_PEEL_SIMPLE:
230 	  peel_loop_simple (loop);
231 	  changed = true;
232 	  break;
233 	case LPT_UNROLL_CONSTANT:
234 	  unroll_loop_constant_iterations (loop);
235 	  changed = true;
236 	  break;
237 	case LPT_UNROLL_RUNTIME:
238 	  unroll_loop_runtime_iterations (loop);
239 	  changed = true;
240 	  break;
241 	case LPT_UNROLL_STUPID:
242 	  unroll_loop_stupid (loop);
243 	  changed = true;
244 	  break;
245 	case LPT_NONE:
246 	  break;
247 	default:
248 	  gcc_unreachable ();
249 	}
250     }
251 
252   if (changed)
253     {
254       calculate_dominance_info (CDI_DOMINATORS);
255       fix_loop_structure (NULL);
256     }
257 
258   iv_analysis_done ();
259 }
260 
261 /* Check whether the exit of the LOOP is at the end of the loop body.  */
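/* (An illustration added here, not from the original comment: this
   holds, e.g., for a loop compiled from do { body; } while (--n);
   where the exit test is the last insn of the body and the latch
   block is empty.)  */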
262 
263 static bool
264 loop_exit_at_end_p (struct loop *loop)
265 {
266   struct niter_desc *desc = get_simple_loop_desc (loop);
267   rtx insn;
268 
269   if (desc->in_edge->dest != loop->latch)
270     return false;
271 
272   /* Check that the latch is empty.  */
273   FOR_BB_INSNS (loop->latch, insn)
274     {
275       if (NONDEBUG_INSN_P (insn))
276 	return false;
277     }
278 
279   return true;
280 }
281 
282 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
283 static void
284 peel_loops_completely (int flags)
285 {
286   struct loop *loop;
287   loop_iterator li;
288   bool changed = false;
289 
290   /* Scan the loops, the inner ones first.  */
291   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
292     {
293       loop->lpt_decision.decision = LPT_NONE;
294       location_t locus = get_loop_location (loop);
295 
296       if (dump_enabled_p ())
297 	dump_printf_loc (TDF_RTL, locus,
298                          ";; *** Considering loop %d at BB %d for "
299                          "complete peeling ***\n",
300                          loop->num, loop->header->index);
301 
302       loop->ninsns = num_loop_insns (loop);
303 
304       decide_peel_once_rolling (loop, flags);
305       if (loop->lpt_decision.decision == LPT_NONE)
306 	decide_peel_completely (loop, flags);
307 
308       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
309 	{
310 	  report_unroll_peel (loop, locus);
311 	  peel_loop_completely (loop);
312 	  changed = true;
313 	}
314     }
315 
316   if (changed)
317     {
318       calculate_dominance_info (CDI_DOMINATORS);
319       fix_loop_structure (NULL);
320     }
321 }
322 
323 /* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
324 static void
325 decide_unrolling_and_peeling (int flags)
326 {
327   struct loop *loop;
328   loop_iterator li;
329 
330   /* Scan the loops, inner ones first.  */
331   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
332     {
333       loop->lpt_decision.decision = LPT_NONE;
334       location_t locus = get_loop_location (loop);
335 
336       if (dump_enabled_p ())
337 	dump_printf_loc (TDF_RTL, locus,
338                          ";; *** Considering loop %d at BB %d for "
339                          "unrolling and peeling ***\n",
340                          loop->num, loop->header->index);
341 
342       /* Do not peel cold areas.  */
343       if (optimize_loop_for_size_p (loop))
344 	{
345 	  if (dump_file)
346 	    fprintf (dump_file, ";; Not considering loop, cold area\n");
347 	  continue;
348 	}
349 
350       /* Can the loop be manipulated?  */
351       if (!can_duplicate_loop_p (loop))
352 	{
353 	  if (dump_file)
354 	    fprintf (dump_file,
355 		     ";; Not considering loop, cannot duplicate\n");
356 	  continue;
357 	}
358 
359       /* Skip non-innermost loops.  */
360       if (loop->inner)
361 	{
362 	  if (dump_file)
363 	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
364 	  continue;
365 	}
366 
367       loop->ninsns = num_loop_insns (loop);
368       loop->av_ninsns = average_num_loop_insns (loop);
369 
370       /* Try transformations one by one in decreasing order of
371 	 priority.  */
372 
373       decide_unroll_constant_iterations (loop, flags);
374       if (loop->lpt_decision.decision == LPT_NONE)
375 	decide_unroll_runtime_iterations (loop, flags);
376       if (loop->lpt_decision.decision == LPT_NONE)
377 	decide_unroll_stupid (loop, flags);
378       if (loop->lpt_decision.decision == LPT_NONE)
379 	decide_peel_simple (loop, flags);
380 
381       report_unroll_peel (loop, locus);
382     }
383 }
384 
385 /* Decide whether the LOOP is once rolling and suitable for complete
386    peeling.  */
387 static void
388 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
389 {
390   struct niter_desc *desc;
391 
392   if (dump_file)
393     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
394 
395   /* Is the loop small enough?  */
396   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
397     {
398       if (dump_file)
399 	fprintf (dump_file, ";; Not considering loop, is too big\n");
400       return;
401     }
402 
403   /* Check for simple loops.  */
404   desc = get_simple_loop_desc (loop);
405 
406   /* Check number of iterations.  */
407   if (!desc->simple_p
408       || desc->assumptions
409       || desc->infinite
410       || !desc->const_iter
411       || (desc->niter != 0
412 	  && max_loop_iterations_int (loop) != 0))
413     {
414       if (dump_file)
415 	fprintf (dump_file,
416 		 ";; Unable to prove that the loop rolls exactly once\n");
417       return;
418     }
419 
420   /* Success.  */
421   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
422 }
423 
424 /* Decide whether the LOOP is suitable for complete peeling.  */
425 static void
426 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
427 {
428   unsigned npeel;
429   struct niter_desc *desc;
430 
431   if (dump_file)
432     fprintf (dump_file, "\n;; Considering peeling completely\n");
433 
434   /* Skip non-innermost loops.  */
435   if (loop->inner)
436     {
437       if (dump_file)
438 	fprintf (dump_file, ";; Not considering loop, is not innermost\n");
439       return;
440     }
441 
442   /* Do not peel cold areas.  */
443   if (optimize_loop_for_size_p (loop))
444     {
445       if (dump_file)
446 	fprintf (dump_file, ";; Not considering loop, cold area\n");
447       return;
448     }
449 
450   /* Can the loop be manipulated?  */
451   if (!can_duplicate_loop_p (loop))
452     {
453       if (dump_file)
454 	fprintf (dump_file,
455 		 ";; Not considering loop, cannot duplicate\n");
456       return;
457     }
458 
459   /* npeel = number of iterations to peel.  */
460   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
461   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
462     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
463 
464   /* Is the loop small enough?  */
465   if (!npeel)
466     {
467       if (dump_file)
468 	fprintf (dump_file, ";; Not considering loop, is too big\n");
469       return;
470     }
471 
472   /* Check for simple loops.  */
473   desc = get_simple_loop_desc (loop);
474 
475   /* Check number of iterations.  */
476   if (!desc->simple_p
477       || desc->assumptions
478       || !desc->const_iter
479       || desc->infinite)
480     {
481       if (dump_file)
482 	fprintf (dump_file,
483 		 ";; Unable to prove that the loop iterates constant times\n");
484       return;
485     }
486 
487   if (desc->niter > npeel - 1)
488     {
489       if (dump_file)
490 	{
491 	  fprintf (dump_file,
492 		   ";; Not peeling loop completely, rolls too much (");
493 	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
494 	  fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
495 	}
496       return;
497     }
498 
499   /* Success.  */
500   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
501 }
502 
503 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
504    completely.  The transformation performed:
505 
506    for (i = 0; i < 4; i++)
507      body;
508 
509    ==>
510 
511    i = 0;
512    body; i++;
513    body; i++;
514    body; i++;
515    body; i++;
516    */
517 static void
518 peel_loop_completely (struct loop *loop)
519 {
520   sbitmap wont_exit;
521   unsigned HOST_WIDE_INT npeel;
522   unsigned i;
523   vec<edge> remove_edges;
524   edge ein;
525   struct niter_desc *desc = get_simple_loop_desc (loop);
526   struct opt_info *opt_info = NULL;
527 
528   npeel = desc->niter;
529 
530   if (npeel)
531     {
532       bool ok;
533 
534       wont_exit = sbitmap_alloc (npeel + 1);
535       bitmap_ones (wont_exit);
536       bitmap_clear_bit (wont_exit, 0);
537       if (desc->noloop_assumptions)
538 	bitmap_clear_bit (wont_exit, 1);
539 
540       remove_edges.create (0);
541 
542       if (flag_split_ivs_in_unroller)
543         opt_info = analyze_insns_in_loop (loop);
544 
545       opt_info_start_duplication (opt_info);
546       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
547 					  npeel,
548 					  wont_exit, desc->out_edge,
549 					  &remove_edges,
550 					  DLTHE_FLAG_UPDATE_FREQ
551 					  | DLTHE_FLAG_COMPLETTE_PEEL
552 					  | (opt_info
553 					     ? DLTHE_RECORD_COPY_NUMBER : 0));
554       gcc_assert (ok);
555 
556       free (wont_exit);
557 
558       if (opt_info)
559  	{
560  	  apply_opt_in_copies (opt_info, npeel, false, true);
561  	  free_opt_info (opt_info);
562  	}
563 
564       /* Remove the exit edges.  */
565       FOR_EACH_VEC_ELT (remove_edges, i, ein)
566 	remove_path (ein);
567       remove_edges.release ();
568     }
569 
570   ein = desc->in_edge;
571   free_simple_loop_desc (loop);
572 
573   /* Now remove the unreachable part of the last iteration and cancel
574      the loop.  */
575   remove_path (ein);
576 
577   if (dump_file)
578     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
579 }
580 
581 /* Decide whether to unroll LOOP iterating a constant number of times
582    and how much.  */
583 
584 static void
585 decide_unroll_constant_iterations (struct loop *loop, int flags)
586 {
587   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
588   struct niter_desc *desc;
589   double_int iterations;
590 
591   if (!(flags & UAP_UNROLL))
592     {
593       /* We were not asked to, so just return silently.  */
594       return;
595     }
596 
597   if (dump_file)
598     fprintf (dump_file,
599 	     "\n;; Considering unrolling loop with constant "
600 	     "number of iterations\n");
601 
602   /* nunroll = total number of copies of the original loop body in the
603      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
604   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
605   nunroll_by_av
606     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
607   if (nunroll > nunroll_by_av)
608     nunroll = nunroll_by_av;
609   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
610     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
611 
612   /* Skip big loops.  */
613   if (nunroll <= 1)
614     {
615       if (dump_file)
616 	fprintf (dump_file, ";; Not considering loop, is too big\n");
617       return;
618     }
619 
620   /* Check for simple loops.  */
621   desc = get_simple_loop_desc (loop);
622 
623   /* Check number of iterations.  */
624   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
625     {
626       if (dump_file)
627 	fprintf (dump_file,
628 		 ";; Unable to prove that the loop iterates constant times\n");
629       return;
630     }
631 
632   /* Check whether the loop rolls enough to consider.
633      Also consult the loop bounds and profile; in case the loop has more
634      than one exit it may well loop fewer times than the determined
635      maximal number of iterations.  */
636   if (desc->niter < 2 * nunroll
637       || ((estimated_loop_iterations (loop, &iterations)
638 	   || max_loop_iterations (loop, &iterations))
639 	  && iterations.ult (double_int::from_shwi (2 * nunroll))))
640     {
641       if (dump_file)
642 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
643       return;
644     }
645 
646   /* Success; now compute the number of iterations to unroll.  We alter
647      nunroll so that as few copies of the loop body as possible are
648      necessary, while still not decreasing the number of unrollings
649      too much (at most by 1).  */
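  /* A worked example with illustrative numbers, assuming the exit test
     is at the beginning of the loop: for desc->niter == 102, an unroll
     factor of 4 (i == 3) needs 102 % 4 == 2 peeled copies plus 4
     in-loop copies, 6 in total, whereas a factor of 3 (i == 2) needs
     102 % 3 == 0 peeled copies plus 3 in-loop copies; the search below
     trades code size against the number of unrollings in this way.  */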
650   best_copies = 2 * nunroll + 10;
651 
652   i = 2 * nunroll + 2;
653   if (i - 1 >= desc->niter)
654     i = desc->niter - 2;
655 
656   for (; i >= nunroll - 1; i--)
657     {
658       unsigned exit_mod = desc->niter % (i + 1);
659 
660       if (!loop_exit_at_end_p (loop))
661 	n_copies = exit_mod + i + 1;
662       else if (exit_mod != (unsigned) i
663 	       || desc->noloop_assumptions != NULL_RTX)
664 	n_copies = exit_mod + i + 2;
665       else
666 	n_copies = i + 1;
667 
668       if (n_copies < best_copies)
669 	{
670 	  best_copies = n_copies;
671 	  best_unroll = i;
672 	}
673     }
674 
675   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
676   loop->lpt_decision.times = best_unroll;
677 }
678 
679 /* Unroll LOOP with a constant number of iterations, LOOP->LPT_DECISION.TIMES times.
680    The transformation does this:
681 
682    for (i = 0; i < 102; i++)
683      body;
684 
685    ==>  (LOOP->LPT_DECISION.TIMES == 3)
686 
687    i = 0;
688    body; i++;
689    body; i++;
690    while (i < 102)
691      {
692        body; i++;
693        body; i++;
694        body; i++;
695        body; i++;
696      }
697   */
698 static void
699 unroll_loop_constant_iterations (struct loop *loop)
700 {
701   unsigned HOST_WIDE_INT niter;
702   unsigned exit_mod;
703   sbitmap wont_exit;
704   unsigned i;
705   vec<edge> remove_edges;
706   edge e;
707   unsigned max_unroll = loop->lpt_decision.times;
708   struct niter_desc *desc = get_simple_loop_desc (loop);
709   bool exit_at_end = loop_exit_at_end_p (loop);
710   struct opt_info *opt_info = NULL;
711   bool ok;
712 
713   niter = desc->niter;
714 
715   /* Should not get here (such a loop should be peeled instead).  */
716   gcc_assert (niter > max_unroll + 1);
717 
718   exit_mod = niter % (max_unroll + 1);
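  /* An illustration tying this to the example in the function comment
     above: with niter == 102 and max_unroll == 3 the unroll factor is
     4, so exit_mod == 102 % 4 == 2; those are the two iterations peeled
     in front of the unrolled body when the exit test is at the
     beginning.  */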
719 
720   wont_exit = sbitmap_alloc (max_unroll + 1);
721   bitmap_ones (wont_exit);
722 
723   remove_edges.create (0);
724   if (flag_split_ivs_in_unroller
725       || flag_variable_expansion_in_unroller)
726     opt_info = analyze_insns_in_loop (loop);
727 
728   if (!exit_at_end)
729     {
730       /* The exit is not at the end of the loop; leave the exit test
731 	 in the first copy, so that loops that start with a test of the
732 	 exit condition have a contiguous body after unrolling.  */
733 
734       if (dump_file)
735 	fprintf (dump_file, ";; Condition at beginning of loop.\n");
736 
737       /* Peel exit_mod iterations.  */
738       bitmap_clear_bit (wont_exit, 0);
739       if (desc->noloop_assumptions)
740 	bitmap_clear_bit (wont_exit, 1);
741 
742       if (exit_mod)
743 	{
744 	  opt_info_start_duplication (opt_info);
745           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
746 					      exit_mod,
747 					      wont_exit, desc->out_edge,
748 					      &remove_edges,
749 					      DLTHE_FLAG_UPDATE_FREQ
750 					      | (opt_info && exit_mod > 1
751 						 ? DLTHE_RECORD_COPY_NUMBER
752 						   : 0));
753 	  gcc_assert (ok);
754 
755           if (opt_info && exit_mod > 1)
756  	    apply_opt_in_copies (opt_info, exit_mod, false, false);
757 
758 	  desc->noloop_assumptions = NULL_RTX;
759 	  desc->niter -= exit_mod;
760 	  loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
761 	  if (loop->any_estimate
762 	      && double_int::from_uhwi (exit_mod).ule
763 	           (loop->nb_iterations_estimate))
764 	    loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
765 	  else
766 	    loop->any_estimate = false;
767 	}
768 
769       bitmap_set_bit (wont_exit, 1);
770     }
771   else
772     {
773       /* Leave the exit test in the last copy, for the same reason as above,
774 	 if the loop tests the condition at the end of the loop body.  */
775 
776       if (dump_file)
777 	fprintf (dump_file, ";; Condition at end of loop.\n");
778 
779       /* We know that niter >= max_unroll + 2; so we do not need to worry
780 	 about the case when we would exit before reaching the loop.  So
781 	 just peel exit_mod + 1 iterations.  */
782       if (exit_mod != max_unroll
783 	  || desc->noloop_assumptions)
784 	{
785 	  bitmap_clear_bit (wont_exit, 0);
786 	  if (desc->noloop_assumptions)
787 	    bitmap_clear_bit (wont_exit, 1);
788 
789           opt_info_start_duplication (opt_info);
790 	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
791 					      exit_mod + 1,
792 					      wont_exit, desc->out_edge,
793 					      &remove_edges,
794 					      DLTHE_FLAG_UPDATE_FREQ
795 					      | (opt_info && exit_mod > 0
796 						 ? DLTHE_RECORD_COPY_NUMBER
797 						   : 0));
798 	  gcc_assert (ok);
799 
800           if (opt_info && exit_mod > 0)
801   	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
802 
803 	  desc->niter -= exit_mod + 1;
804 	  loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
805 	  if (loop->any_estimate
806 	      && double_int::from_uhwi (exit_mod + 1).ule
807 	           (loop->nb_iterations_estimate))
808 	    loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
809 	  else
810 	    loop->any_estimate = false;
811 	  desc->noloop_assumptions = NULL_RTX;
812 
813 	  bitmap_set_bit (wont_exit, 0);
814 	  bitmap_set_bit (wont_exit, 1);
815 	}
816 
817       bitmap_clear_bit (wont_exit, max_unroll);
818     }
819 
820   /* Now unroll the loop.  */
821 
822   opt_info_start_duplication (opt_info);
823   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
824 				      max_unroll,
825 				      wont_exit, desc->out_edge,
826 				      &remove_edges,
827 				      DLTHE_FLAG_UPDATE_FREQ
828 				      | (opt_info
829 					 ? DLTHE_RECORD_COPY_NUMBER
830 					   : 0));
831   gcc_assert (ok);
832 
833   if (opt_info)
834     {
835       apply_opt_in_copies (opt_info, max_unroll, true, true);
836       free_opt_info (opt_info);
837     }
838 
839   free (wont_exit);
840 
841   if (exit_at_end)
842     {
843       basic_block exit_block = get_bb_copy (desc->in_edge->src);
844       /* Find the new in and out edges; they are in the last copy we have made.  */
845 
846       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
847 	{
848 	  desc->out_edge = EDGE_SUCC (exit_block, 0);
849 	  desc->in_edge = EDGE_SUCC (exit_block, 1);
850 	}
851       else
852 	{
853 	  desc->out_edge = EDGE_SUCC (exit_block, 1);
854 	  desc->in_edge = EDGE_SUCC (exit_block, 0);
855 	}
856     }
857 
858   desc->niter /= max_unroll + 1;
859   loop->nb_iterations_upper_bound
860     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
861 								   + 1),
862 					    TRUNC_DIV_EXPR);
863   if (loop->any_estimate)
864     loop->nb_iterations_estimate
865       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
866 							          + 1),
867 				           TRUNC_DIV_EXPR);
868   desc->niter_expr = GEN_INT (desc->niter);
869 
870   /* Remove the edges.  */
871   FOR_EACH_VEC_ELT (remove_edges, i, e)
872     remove_path (e);
873   remove_edges.release ();
874 
875   if (dump_file)
876     fprintf (dump_file,
877 	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
878 	     max_unroll, num_loop_insns (loop));
879 }
880 
881 /* Decide whether to unroll LOOP iterating a runtime-computable number of times
882    and how much.  */
883 static void
884 decide_unroll_runtime_iterations (struct loop *loop, int flags)
885 {
886   unsigned nunroll, nunroll_by_av, i;
887   struct niter_desc *desc;
888   double_int iterations;
889 
890   if (!(flags & UAP_UNROLL))
891     {
892       /* We were not asked to, so just return silently.  */
893       return;
894     }
895 
896   if (dump_file)
897     fprintf (dump_file,
898 	     "\n;; Considering unrolling loop with runtime "
899 	     "computable number of iterations\n");
900 
901   /* nunroll = total number of copies of the original loop body in the
902      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
903   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
904   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
905   if (nunroll > nunroll_by_av)
906     nunroll = nunroll_by_av;
907   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
908     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
909 
910   if (targetm.loop_unroll_adjust)
911     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
912 
913   /* Skip big loops.  */
914   if (nunroll <= 1)
915     {
916       if (dump_file)
917 	fprintf (dump_file, ";; Not considering loop, is too big\n");
918       return;
919     }
920 
921   /* Check for simple loops.  */
922   desc = get_simple_loop_desc (loop);
923 
924   /* Check simpleness.  */
925   if (!desc->simple_p || desc->assumptions)
926     {
927       if (dump_file)
928 	fprintf (dump_file,
929 		 ";; Unable to prove that the number of iterations "
930 		 "can be counted in runtime\n");
931       return;
932     }
933 
934   if (desc->const_iter)
935     {
936       if (dump_file)
937 	fprintf (dump_file, ";; Loop iterates constant times\n");
938       return;
939     }
940 
941   /* Check whether the loop rolls.  */
942   if ((estimated_loop_iterations (loop, &iterations)
943        || max_loop_iterations (loop, &iterations))
944       && iterations.ult (double_int::from_shwi (2 * nunroll)))
945     {
946       if (dump_file)
947 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
948       return;
949     }
950 
951   /* Success; now force nunroll to be a power of 2, as we are unable to
952      cope with overflows in the computation of the number of iterations.  */
953   for (i = 1; 2 * i <= nunroll; i *= 2)
954     continue;
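  /* E.g. (an illustrative value, not from the original comment):
     nunroll == 6 leaves i == 4, the largest power of two not exceeding
     nunroll, so lpt_decision.times below becomes 3 and the unrolled
     loop will contain 4 copies of the body.  */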
955 
956   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
957   loop->lpt_decision.times = i - 1;
958 }
959 
960 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
961    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
962    and NULL is returned instead.  */
963 
964 basic_block
965 split_edge_and_insert (edge e, rtx insns)
966 {
967   basic_block bb;
968 
969   if (!insns)
970     return NULL;
971   bb = split_edge (e);
972   emit_insn_after (insns, BB_END (bb));
973 
974   /* ??? We used to assume that INSNS can contain control flow insns, and
975      that we had to try to find sub basic blocks in BB to maintain a valid
976      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
977      and call break_superblocks when going out of cfglayout mode.  But it
978      turns out that this never happens; and that if it does ever happen,
979      the TODO_verify_flow at the end of the RTL loop passes would fail.
980 
981      There are two reasons why we expected we could have control flow insns
982      in INSNS.  The first is when a comparison has to be done in parts, and
983      the second is when the number of iterations is computed for loops with
984      the number of iterations known at runtime.  In both cases, test cases
985      to get control flow in INSNS appear to be impossible to construct:
986 
987       * If do_compare_rtx_and_jump needs several branches to do comparison
988 	in a mode that needs comparison by parts, we cannot analyze the
989 	number of iterations of the loop, and we never get to unrolling it.
990 
991       * The code in expand_divmod that was suspected to cause creation of
992 	branching code seems to be only accessed for signed division.  The
993 	divisions used by # of iterations analysis are always unsigned.
994 	Problems might arise on architectures that emit branching code
995 	for some operations that may appear in the unroller (especially
996 	for division), but we have no such architectures.
997 
998      Considering all this, it was decided that we should for now assume
999      that INSNS can in theory contain control flow insns, but in practice
1000      it never does.  So we don't handle the theoretical case, and should
1001      a real failure ever show up, we have a pretty good clue for how to
1002      fix it.  */
1003 
1004   return bb;
1005 }
1006 
1007 /* Unroll LOOP, for which we are able to count the number of iterations at
1008    runtime, LOOP->LPT_DECISION.TIMES times.  The transformation does this
1009    (with some extra care for the case n < 0):
1010 
1011    for (i = 0; i < n; i++)
1012      body;
1013 
1014    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1015 
1016    i = 0;
1017    mod = n % 4;
1018 
1019    switch (mod)
1020      {
1021        case 3:
1022          body; i++;
1023        case 2:
1024          body; i++;
1025        case 1:
1026          body; i++;
1027        case 0: ;
1028      }
1029 
1030    while (i < n)
1031      {
1032        body; i++;
1033        body; i++;
1034        body; i++;
1035        body; i++;
1036      }
1037    */
1038 static void
1039 unroll_loop_runtime_iterations (struct loop *loop)
1040 {
1041   rtx old_niter, niter, init_code, branch_code, tmp;
1042   unsigned i, j, p;
1043   basic_block preheader, *body, swtch, ezc_swtch;
1044   vec<basic_block> dom_bbs;
1045   sbitmap wont_exit;
1046   int may_exit_copy;
1047   unsigned n_peel;
1048   vec<edge> remove_edges;
1049   edge e;
1050   bool extra_zero_check, last_may_exit;
1051   unsigned max_unroll = loop->lpt_decision.times;
1052   struct niter_desc *desc = get_simple_loop_desc (loop);
1053   bool exit_at_end = loop_exit_at_end_p (loop);
1054   struct opt_info *opt_info = NULL;
1055   bool ok;
1056 
1057   if (flag_split_ivs_in_unroller
1058       || flag_variable_expansion_in_unroller)
1059     opt_info = analyze_insns_in_loop (loop);
1060 
1061   /* Remember blocks whose dominators will have to be updated.  */
1062   dom_bbs.create (0);
1063 
1064   body = get_loop_body (loop);
1065   for (i = 0; i < loop->num_nodes; i++)
1066     {
1067       vec<basic_block> ldom;
1068       basic_block bb;
1069 
1070       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1071       FOR_EACH_VEC_ELT (ldom, j, bb)
1072 	if (!flow_bb_inside_loop_p (loop, bb))
1073 	  dom_bbs.safe_push (bb);
1074 
1075       ldom.release ();
1076     }
1077   free (body);
1078 
1079   if (!exit_at_end)
1080     {
1081       /* Leave the exit in the first copy (for an explanation why, see
1082 	 the comment in unroll_loop_constant_iterations).  */
1083       may_exit_copy = 0;
1084       n_peel = max_unroll - 1;
1085       extra_zero_check = true;
1086       last_may_exit = false;
1087     }
1088   else
1089     {
1090       /* Leave the exit in the last copy (for an explanation why, see
1091 	 the comment in unroll_loop_constant_iterations).  */
1092       may_exit_copy = max_unroll;
1093       n_peel = max_unroll;
1094       extra_zero_check = false;
1095       last_may_exit = true;
1096     }
1097 
1098   /* Get expression for number of iterations.  */
1099   start_sequence ();
1100   old_niter = niter = gen_reg_rtx (desc->mode);
1101   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1102   if (tmp != niter)
1103     emit_move_insn (niter, tmp);
1104 
1105   /* Compute the modulo by ANDing the count with max_unroll; we use the
1106      fact that the number of unrollings is a power of two, and thus this
1107      is correct even if there is overflow in the computation.  */
1108   niter = expand_simple_binop (desc->mode, AND,
1109 			       niter,
1110 			       GEN_INT (max_unroll),
1111 			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
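  /* An added illustration: with max_unroll == 3 the factor is 4, and
     niter & 3 equals niter % 4 even if the computation of niter
     wrapped around, since only the low-order bits matter for a
     power-of-two modulus.  */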
1112 
1113   init_code = get_insns ();
1114   end_sequence ();
1115   unshare_all_rtl_in_chain (init_code);
1116 
1117   /* Precondition the loop.  */
1118   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1119 
1120   remove_edges.create (0);
1121 
1122   wont_exit = sbitmap_alloc (max_unroll + 2);
1123 
1124   /* Peel the first copy of the loop body (almost always we must leave the
1125      exit test here; the only exception is when we have an extra zero check
1126      and the number of iterations is reliable).  Also record the place of
1127      the (possible) extra zero check.  */
1128   bitmap_clear (wont_exit);
1129   if (extra_zero_check
1130       && !desc->noloop_assumptions)
1131     bitmap_set_bit (wont_exit, 1);
1132   ezc_swtch = loop_preheader_edge (loop)->src;
1133   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1134 				      1, wont_exit, desc->out_edge,
1135 				      &remove_edges,
1136 				      DLTHE_FLAG_UPDATE_FREQ);
1137   gcc_assert (ok);
1138 
1139   /* Record the place where the switch will be built for preconditioning.  */
1140   swtch = split_edge (loop_preheader_edge (loop));
1141 
1142   for (i = 0; i < n_peel; i++)
1143     {
1144       /* Peel the copy.  */
1145       bitmap_clear (wont_exit);
1146       if (i != n_peel - 1 || !last_may_exit)
1147 	bitmap_set_bit (wont_exit, 1);
1148       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1149 					  1, wont_exit, desc->out_edge,
1150 					  &remove_edges,
1151 					  DLTHE_FLAG_UPDATE_FREQ);
1152       gcc_assert (ok);
1153 
1154       /* Create item for switch.  */
1155       j = n_peel - i - (extra_zero_check ? 0 : 1);
1156       p = REG_BR_PROB_BASE / (i + 2);
1157 
1158       preheader = split_edge (loop_preheader_edge (loop));
1159       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1160 					  block_label (preheader), p,
1161 					  NULL_RTX);
1162 
1163       /* We rely on the fact that the compare and jump cannot be optimized out,
1164 	 and hence the cfg we create is correct.  */
1165       gcc_assert (branch_code != NULL_RTX);
1166 
1167       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1168       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1169       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1170       e = make_edge (swtch, preheader,
1171 		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1172       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1173       e->probability = p;
1174     }
1175 
1176   if (extra_zero_check)
1177     {
1178       /* Add branch for zero iterations.  */
1179       p = REG_BR_PROB_BASE / (max_unroll + 1);
1180       swtch = ezc_swtch;
1181       preheader = split_edge (loop_preheader_edge (loop));
1182       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1183 					  block_label (preheader), p,
1184 					  NULL_RTX);
1185       gcc_assert (branch_code != NULL_RTX);
1186 
1187       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1188       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1189       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1190       e = make_edge (swtch, preheader,
1191 		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1192       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1193       e->probability = p;
1194     }
1195 
1196   /* Recount dominators for outer blocks.  */
1197   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1198 
1199   /* And unroll loop.  */
1200 
1201   bitmap_ones (wont_exit);
1202   bitmap_clear_bit (wont_exit, may_exit_copy);
1203   opt_info_start_duplication (opt_info);
1204 
1205   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1206 				      max_unroll,
1207 				      wont_exit, desc->out_edge,
1208 				      &remove_edges,
1209 				      DLTHE_FLAG_UPDATE_FREQ
1210 				      | (opt_info
1211 					 ? DLTHE_RECORD_COPY_NUMBER
1212 					   : 0));
1213   gcc_assert (ok);
1214 
1215   if (opt_info)
1216     {
1217       apply_opt_in_copies (opt_info, max_unroll, true, true);
1218       free_opt_info (opt_info);
1219     }
1220 
1221   free (wont_exit);
1222 
1223   if (exit_at_end)
1224     {
1225       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1226       /* Find the new in and out edges; they are in the last copy we have
1227 	 made.  */
1228 
1229       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1230 	{
1231 	  desc->out_edge = EDGE_SUCC (exit_block, 0);
1232 	  desc->in_edge = EDGE_SUCC (exit_block, 1);
1233 	}
1234       else
1235 	{
1236 	  desc->out_edge = EDGE_SUCC (exit_block, 1);
1237 	  desc->in_edge = EDGE_SUCC (exit_block, 0);
1238 	}
1239     }
1240 
1241   /* Remove the edges.  */
1242   FOR_EACH_VEC_ELT (remove_edges, i, e)
1243     remove_path (e);
1244   remove_edges.release ();
1245 
1246   /* We must be careful when updating the number of iterations due to
1247      preconditioning and the fact that the value must be valid at entry
1248      of the loop.  After passing through the above code, we see that
1249      the correct new number of iterations is this:  */
1250   gcc_assert (!desc->const_iter);
1251   desc->niter_expr =
1252     simplify_gen_binary (UDIV, desc->mode, old_niter,
1253 			 GEN_INT (max_unroll + 1));
1254   loop->nb_iterations_upper_bound
1255     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1256 								   + 1),
1257 					    TRUNC_DIV_EXPR);
1258   if (loop->any_estimate)
1259     loop->nb_iterations_estimate
1260       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1261 							          + 1),
1262 				           TRUNC_DIV_EXPR);
1263   if (exit_at_end)
1264     {
1265       desc->niter_expr =
1266 	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1267       desc->noloop_assumptions = NULL_RTX;
1268       --loop->nb_iterations_upper_bound;
1269       if (loop->any_estimate
1270 	  && loop->nb_iterations_estimate != double_int_zero)
1271 	--loop->nb_iterations_estimate;
1272       else
1273 	loop->any_estimate = false;
1274     }
1275 
1276   if (dump_file)
1277     fprintf (dump_file,
1278 	     ";; Unrolled loop %d times, counting # of iterations "
1279 	     "in runtime, %i insns\n",
1280 	     max_unroll, num_loop_insns (loop));
1281 
1282   dom_bbs.release ();
1283 }
1284 
1285 /* Decide whether to simply peel LOOP and how much.  */
1286 static void
1287 decide_peel_simple (struct loop *loop, int flags)
1288 {
1289   unsigned npeel;
1290   double_int iterations;
1291 
1292   if (!(flags & UAP_PEEL))
1293     {
1294       /* We were not asked to, so just return silently.  */
1295       return;
1296     }
1297 
1298   if (dump_file)
1299     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1300 
1301   /* npeel = number of iterations to peel.  */
1302   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1303   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1304     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1305 
1306   /* Skip big loops.  */
1307   if (!npeel)
1308     {
1309       if (dump_file)
1310 	fprintf (dump_file, ";; Not considering loop, is too big\n");
1311       return;
1312     }
1313 
1314   /* Do not simply peel loops with branches inside -- it increases the
1315      number of mispredicts.
1316      The exception is when we do have a profile and hence a good chance
1317      of peeling the proper number of iterations that the loop iterates
1318      in practice.  TODO: this heuristic needs tuning; while for complete
1319      unrolling the branch inside the loop mostly eliminates any
1320      improvements, for peeling that is not the case.  Also, a function
1321      call inside the loop is also a branch from the branch prediction
1322      POV (and probably a better reason not to unroll/peel).  */
1323   if (num_loop_branches (loop) > 1
1324       && profile_status != PROFILE_READ)
1325     {
1326       if (dump_file)
1327 	fprintf (dump_file, ";; Not peeling, contains branches\n");
1328       return;
1329     }
1330 
1331   /* If we have a realistic estimate of the number of iterations, use it.  */
1332   if (estimated_loop_iterations (loop, &iterations))
1333     {
1334       if (double_int::from_shwi (npeel).ule (iterations))
1335 	{
1336 	  if (dump_file)
1337 	    {
1338 	      fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1339 	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1340 		       (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1341 	      fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1342 		       npeel);
1343 	    }
1344 	  return;
1345 	}
1346       npeel = iterations.to_shwi () + 1;
1347     }
1348   /* If we have a small enough bound on the iterations, we can still peel
1349      (completely unroll).  */
1350   else if (max_loop_iterations (loop, &iterations)
1351            && iterations.ult (double_int::from_shwi (npeel)))
1352     npeel = iterations.to_shwi () + 1;
1353   else
1354     {
1355       /* For now we have no good heuristics to decide whether loop peeling
1356          will be effective, so disable it.  */
1357       if (dump_file)
1358 	fprintf (dump_file,
1359 		 ";; Not peeling loop, no evidence it will be profitable\n");
1360       return;
1361     }
1362 
1363   /* Success.  */
1364   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1365   loop->lpt_decision.times = npeel;
1366 }
1367 
1368 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1369 
1370    while (cond)
1371      body;
1372 
1373    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1374 
1375    if (!cond) goto end;
1376    body;
1377    if (!cond) goto end;
1378    body;
1379    if (!cond) goto end;
1380    body;
1381    while (cond)
1382      body;
1383    end: ;
1384    */
1385 static void
1386 peel_loop_simple (struct loop *loop)
1387 {
1388   sbitmap wont_exit;
1389   unsigned npeel = loop->lpt_decision.times;
1390   struct niter_desc *desc = get_simple_loop_desc (loop);
1391   struct opt_info *opt_info = NULL;
1392   bool ok;
1393 
1394   if (flag_split_ivs_in_unroller && npeel > 1)
1395     opt_info = analyze_insns_in_loop (loop);
1396 
1397   wont_exit = sbitmap_alloc (npeel + 1);
1398   bitmap_clear (wont_exit);
1399 
1400   opt_info_start_duplication (opt_info);
1401 
1402   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1403 				      npeel, wont_exit, NULL,
1404 				      NULL, DLTHE_FLAG_UPDATE_FREQ
1405 				      | (opt_info
1406 					 ? DLTHE_RECORD_COPY_NUMBER
1407 					   : 0));
1408   gcc_assert (ok);
1409 
1410   free (wont_exit);
1411 
1412   if (opt_info)
1413     {
1414       apply_opt_in_copies (opt_info, npeel, false, false);
1415       free_opt_info (opt_info);
1416     }
1417 
1418   if (desc->simple_p)
1419     {
1420       if (desc->const_iter)
1421 	{
1422 	  desc->niter -= npeel;
1423 	  desc->niter_expr = GEN_INT (desc->niter);
1424 	  desc->noloop_assumptions = NULL_RTX;
1425 	}
1426       else
1427 	{
1428 	  /* We cannot just update niter_expr, as its value might be clobbered
1429 	     inside the loop.  We could handle this by counting the number into
1430 	     a temporary just like we do in runtime unrolling, but it does not
1431 	     seem worthwhile.  */
1432 	  free_simple_loop_desc (loop);
1433 	}
1434     }
1435   if (dump_file)
1436     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1437 }
1438 
1439 /* Decide whether to unroll LOOP stupidly and how much.  */
1440 static void
1441 decide_unroll_stupid (struct loop *loop, int flags)
1442 {
1443   unsigned nunroll, nunroll_by_av, i;
1444   struct niter_desc *desc;
1445   double_int iterations;
1446 
1447   if (!(flags & UAP_UNROLL_ALL))
1448     {
1449       /* We were not asked to, so just return silently.  */
1450       return;
1451     }
1452 
1453   if (dump_file)
1454     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1455 
1456   /* nunroll = total number of copies of the original loop body in the
1457      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
1458   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1459   nunroll_by_av
1460     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1461   if (nunroll > nunroll_by_av)
1462     nunroll = nunroll_by_av;
1463   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1464     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1465 
1466   if (targetm.loop_unroll_adjust)
1467     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1468 
1469   /* Skip big loops.  */
1470   if (nunroll <= 1)
1471     {
1472       if (dump_file)
1473 	fprintf (dump_file, ";; Not considering loop, is too big\n");
1474       return;
1475     }
1476 
1477   /* Check for simple loops.  */
1478   desc = get_simple_loop_desc (loop);
1479 
1480   /* Check simpleness.  */
1481   if (desc->simple_p && !desc->assumptions)
1482     {
1483       if (dump_file)
1484 	fprintf (dump_file, ";; The loop is simple\n");
1485       return;
1486     }
1487 
1488   /* Do not unroll loops with branches inside -- it increases the number
1489      of mispredicts.
1490      TODO: this heuristic needs tuning; a call inside the loop body
1491      is also a relatively good reason not to unroll.  */
1492   if (num_loop_branches (loop) > 1)
1493     {
1494       if (dump_file)
1495 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
1496       return;
1497     }
1498 
1499   /* Check whether the loop rolls.  */
1500   if ((estimated_loop_iterations (loop, &iterations)
1501        || max_loop_iterations (loop, &iterations))
1502       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1503     {
1504       if (dump_file)
1505 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1506       return;
1507     }
1508 
1509   /* Success.  Now force nunroll to be a power of 2, as it seems that this
1510      improves results (partially because of better alignment, partially
1511      because of some dark magic).  */
1512   for (i = 1; 2 * i <= nunroll; i *= 2)
1513     continue;
1514 
1515   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1516   loop->lpt_decision.times = i - 1;
1517 }
1518 
1519 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1520 
1521    while (cond)
1522      body;
1523 
1524    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1525 
1526    while (cond)
1527      {
1528        body;
1529        if (!cond) break;
1530        body;
1531        if (!cond) break;
1532        body;
1533        if (!cond) break;
1534        body;
1535      }
1536    */
1537 static void
1538 unroll_loop_stupid (struct loop *loop)
1539 {
1540   sbitmap wont_exit;
1541   unsigned nunroll = loop->lpt_decision.times;
1542   struct niter_desc *desc = get_simple_loop_desc (loop);
1543   struct opt_info *opt_info = NULL;
1544   bool ok;
1545 
1546   if (flag_split_ivs_in_unroller
1547       || flag_variable_expansion_in_unroller)
1548     opt_info = analyze_insns_in_loop (loop);
1549 
1550 
1551   wont_exit = sbitmap_alloc (nunroll + 1);
1552   bitmap_clear (wont_exit);
1553   opt_info_start_duplication (opt_info);
1554 
1555   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1556 				      nunroll, wont_exit,
1557 				      NULL, NULL,
1558 				      DLTHE_FLAG_UPDATE_FREQ
1559 				      | (opt_info
1560 					 ? DLTHE_RECORD_COPY_NUMBER
1561 					   : 0));
1562   gcc_assert (ok);
1563 
1564   if (opt_info)
1565     {
1566       apply_opt_in_copies (opt_info, nunroll, true, true);
1567       free_opt_info (opt_info);
1568     }
1569 
1570   free (wont_exit);
1571 
1572   if (desc->simple_p)
1573     {
1574       /* We indeed may get here provided that there are nontrivial assumptions
1575 	 for a loop to be really simple.  We could update the counts, but the
1576 	 problem is that we are unable to decide which exit will be taken
1577 	 but no one will do anything with this information, so we do not
1578 	 but noone will do anything with this information, so we do not
1579 	 worry about it).  */
1580       desc->simple_p = false;
1581     }
1582 
1583   if (dump_file)
1584     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1585 	     nunroll, num_loop_insns (loop));
1586 }
1587 
1588 /* A hash function for information about insns to split.  */
1589 
1590 static hashval_t
1591 si_info_hash (const void *ivts)
1592 {
1593   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1594 }
1595 
1596 /* An equality function for information about insns to split.  */
1597 
1598 static int
1599 si_info_eq (const void *ivts1, const void *ivts2)
1600 {
1601   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1602   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1603 
1604   return i1->insn == i2->insn;
1605 }
1606 
1607 /* Return a hash for VES, which is really a "var_to_expand *".  */
1608 
1609 static hashval_t
1610 ve_info_hash (const void *ves)
1611 {
1612   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1613 }
1614 
1615 /* Return true if IVTS1 and IVTS2 (which are really both of type
1616    "var_to_expand *") refer to the same instruction.  */
1617 
1618 static int
1619 ve_info_eq (const void *ivts1, const void *ivts2)
1620 {
1621   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1622   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1623 
1624   return i1->insn == i2->insn;
1625 }
1626 
1627 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1628    Set *DEBUG_USES to the number of debug insns that reference the
1629    variable.  */
1630 
1631 bool
1632 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1633 				  int *debug_uses)
1634 {
1635   basic_block *body, bb;
1636   unsigned i;
1637   int count_ref = 0;
1638   rtx insn;
1639 
1640   body = get_loop_body (loop);
1641   for (i = 0; i < loop->num_nodes; i++)
1642     {
1643       bb = body[i];
1644 
1645       FOR_BB_INSNS (bb, insn)
1646 	if (!rtx_referenced_p (reg, insn))
1647 	  continue;
1648 	else if (DEBUG_INSN_P (insn))
1649 	  ++*debug_uses;
1650 	else if (++count_ref > 1)
1651 	  break;
1652     }
1653   free (body);
1654   return (count_ref  == 1);
1655 }

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of an
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using (initializing the
	 expansions with zero and summing them at the end of the
	 computation) yields wrong results for (x = something - x),
	 thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
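
/* Illustrative end-to-end sketch, not from the original source, assuming
   a hypothetical float accumulator SUM and two extra copies of the body
   (handling of leftover iterations omitted).  Only the new registers get
   initialized; SUM keeps its incoming value:

   sum1 = -0.0;                  (preheader; see
   sum2 = -0.0;                   insert_var_expansion_initialization)
   for (i = 0; i < n; i += 3)
     {
       sum += a[i];
       sum1 += a[i + 1];
       sum2 += a[i + 2];
     }
   sum = sum2 + (sum1 + sum);    (loop exit; see
                                  combine_var_copies_in_loop_exit)

   The three additions in each iteration are now independent, at the
   price of reassociating the reduction, hence the flag_associative_math
   check above.  */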

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended, for example, by also selecting addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true, then iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;
  ivts->n_loc = 1;
  ivts->loc[0] = 1;

  return ivts;
}
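
/* Illustrative sketch, not from the original source: splitting the IV
   breaks the serial dependence between the unrolled increments.  For a
   hypothetical counter I with step 1 and two extra copies, the chain

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   becomes

   i0 = i + 1;   (base initialization, delta 0)
   i = i0;
   ...
   i = i0 + 1;   (delta 1)
   ...
   i = i0 + 2;   (delta 2)
   ...

   so the later assignments depend only on I0 and can execute in
   parallel instead of forming a chain.  */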

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  PTR *slot1;
  PTR *slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
					      si_info_hash, si_info_eq, free);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
							ve_info_hash,
							ve_info_eq, free);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
      {
        if (!INSN_P (insn))
          continue;

        if (opt_info->insns_to_split)
          ivts = analyze_iv_to_split_insn (insn);

        if (ivts)
          {
            slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
	    gcc_assert (*slot1 == NULL);
            *slot1 = ivts;
	    *opt_info->iv_to_split_tail = ivts;
	    opt_info->iv_to_split_tail = &ivts->next;
            continue;
          }

        if (opt_info->insns_with_var_to_expand)
          ves = analyze_insn_to_expand_var (loop, insn);

        if (ves)
          {
            slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
	    gcc_assert (*slot2 == NULL);
            *slot2 = ves;
	    *opt_info->var_to_expand_tail = ves;
	    opt_info->var_to_expand_tail = &ves->next;
          }
      }
    }

  edges.release ();
  free (body);
  return opt_info;
}
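
/* Illustrative note, not from the original source: variable expansion
   needs a single non-complex exit edge because the partial accumulators
   must be recombined in exactly one place.  E.g. (hypothetical):

   for (i = 0; i < n; i++)       one exit; SUM may be expanded and the
     sum += a[i];                copies summed on the split exit edge

   for (i = 0; i < n; i++)       a second exit via break; there is no
     {                           single block to hold the combination
       if (p[i])                 insns, so only IV splitting applies
	 break;
       sum += a[i];
     }  */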

/* Called just before loop duplication.  Records start of duplicated area
   to OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block;
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
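
/* Worked example, illustrative only, with N_COPIES == 2.  When
   unrolling, the initialization sits in the original body, which runs
   first:

     copy 0 (original) -> delta 0
     copy 1            -> delta 1
     copy 2            -> delta 2

   When peeling, copy 1 holds the initialization and the original body
   runs last:

     copy 1            -> delta 0
     copy 2            -> delta 1
     copy 0 (original) -> delta 2 (== N_COPIES)  */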

/* Locate in EXPR the expression corresponding to the location recorded
   in IVTS, and return a pointer to the RTX for this location.  */

static rtx *
get_ivts_expr (rtx expr, struct iv_to_split *ivts)
{
  unsigned i;
  rtx *ret = &expr;

  for (i = 0; i < ivts->n_loc; i++)
    ret = &XEXP (*ret, ivts->loc[i]);

  return ret;
}
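
/* For instance, illustrative only: analyze_iv_to_split_insn records
   N_LOC == 1 and LOC[0] == 1, so for a biv insn whose single set is

     (set (reg i) (plus (reg i) (const_int 1)))

   the walk yields &XEXP (set, 1), i.e. a pointer to the (plus ...)
   source operand, which is the spot that insert_base_initialization
   reads and split_iv overwrites.  */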

/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
  rtx seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* As a last resort, try recreating the assignment in INSN
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}


/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
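
/* Worked example, illustrative only: with two recorded expansions E1
   and E2 and REUSE_EXPANSION starting at 0, successive calls return

     VE->reg, E1, E2, VE->reg, E1, E2, ...

   so once the expansion limit has been reached, the accumulator and
   its copies are handed out round-robin.  */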

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Otherwise reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
        ve->var_expansions.safe_push (new_reg);
        ve->expansion_count++;
      }
}

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx seq, var, zero_init;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
          emit_move_insn (var, zero_init);
        }
      break;

    case MULT:
      /* Products are seeded with the multiplicative identity, 1.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
        {
          zero_init = CONST1_RTX (GET_MODE (var));
          emit_move_insn (var, zero_init);
        }
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
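
/* A minimal standalone sketch, not part of GCC, of why the expansions
   are seeded with -0.0 rather than +0.0 when signed zeros must be
   honored.  Under IEEE 754 round-to-nearest, +0.0 + -0.0 gives +0.0,
   so a +0.0-seeded partial sum would turn an all-negative-zero
   reduction positive:

   #include <stdio.h>

   int
   main (void)
   {
     double term = -0.0;
     double seeded_plus = 0.0 + term;    // +0.0: the sign is lost
     double seeded_minus = -0.0 + term;  // -0.0: the sign survives
     // Dividing makes the difference visible, since +0.0 == -0.0
     // compares equal: this prints "inf -inf".
     printf ("%g %g\n", 1.0 / seeded_plus, 1.0 / seeded_minus);
     return 0;
   }

   The analogous reason is why MULT expansions are seeded with 1, the
   multiplicative identity, in the function above.  */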

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
        {
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply splitting iv optimization.  */
          if (opt_info->insns_to_split)
            {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

              ivts = (struct iv_to_split *)
		htab_find (opt_info->insns_to_split, &ivts_templ);

              if (ivts)
                {
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
		htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
              if (ves)
                {
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Also rewrite the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
	    continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

              ivts = (struct iv_to_split *)
		htab_find (opt_info->insns_to_split, &ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }

        }
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  if (opt_info->insns_to_split)
    htab_delete (opt_info->insns_to_split);
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      htab_delete (opt_info->insns_with_var_to_expand);
    }
  free (opt_info);
}