/* Loop versioning pass.
   Copyright (C) 2018-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "tree-pass.h"
#include "gimplify-me.h"
#include "cfgloop.h"
#include "tree-ssa-loop.h"
#include "ssa.h"
#include "tree-scalar-evolution.h"
#include "tree-chrec.h"
#include "tree-ssa-loop-ivopts.h"
#include "fold-const.h"
#include "tree-ssa-propagate.h"
#include "tree-inline.h"
#include "domwalk.h"
#include "alloc-pool.h"
#include "vr-values.h"
#include "gimple-ssa-evrp-analyze.h"
#include "tree-vectorizer.h"
#include "omp-general.h"
#include "predict.h"
#include "tree-into-ssa.h"
#include "params.h"

namespace {

/* This pass looks for loops that could be simplified if certain loop
   invariant conditions were true.  It is effectively a form of loop
   splitting in which the pass produces the split conditions itself,
   instead of using ones that are already present in the IL.

   Versioning for when strides are 1
   ---------------------------------

   At the moment the only thing the pass looks for are memory references
   like:

     for (auto i : ...)
       ...x[i * stride]...

   It considers changing such loops to:

     if (stride == 1)
       for (auto i : ...)    [A]
	 ...x[i]...
     else
       for (auto i : ...)    [B]
	 ...x[i * stride]...

   This can have several benefits:

   (1) [A] is often easier or cheaper to vectorize than [B].

   (2) The scalar code in [A] is simpler than the scalar code in [B]
       (if the loops cannot be vectorized or need an epilogue loop).

   (3) We might recognize [A] as a pattern, such as a memcpy or memset.

   (4) [A] has simpler address evolutions, which can help other passes
       like loop interchange.

   The optimization is particularly useful for assumed-shape arrays in
   Fortran, where the stride of the innermost dimension depends on the
   array descriptor but is often equal to 1 in practice.  For example:

     subroutine f1(x)
       real :: x(:)
       x(:) = 100
     end subroutine f1

   generates the equivalent of:

     raw_stride = *x.dim[0].stride;
     stride = raw_stride != 0 ? raw_stride : 1;
     x_base = *x.data;
     ...
     tmp1 = stride * S;
     tmp2 = tmp1 - stride;
     *x_base[tmp2] = 1.0e+2;

   but in the common case that stride == 1, the last three statements
   simplify to:

     tmp3 = S + -1;
     *x_base[tmp3] = 1.0e+2;

   The optimization is in principle very simple.  The difficult parts are:

   (a) deciding which parts of a general address calculation correspond
       to the inner dimension of an array, since this usually isn't explicit
       in the IL, and for C often isn't even explicit in the source code

   (b) estimating when the transformation is worthwhile

   Structure
   ---------

   The pass has four phases:

   (1) Walk through the statements looking for and recording potential
       versioning opportunities.  Stop if there are none.

   (2) Use context-sensitive range information to see whether any versioning
       conditions are impossible in practice.  Remove them if so, and stop
       if no opportunities remain.

       (We do this only after (1) to keep compile time down when no
       versioning opportunities exist.)

   (3) Apply the cost model.  Decide which versioning opportunities are
       worthwhile and at which nesting level they should be applied.

   (4) Attempt to version all the loops selected by (3), so that:

	 for (...)
	   ...

       becomes:

	 if (!cond)
	   for (...) // Original loop
	     ...
	 else
	   for (...) // New loop
	     ...

       Use the version condition COND to simplify the new loop.  */

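/* As an illustrative sketch only (the names below are invented for the
   example; the pass itself works on GIMPLE rather than source code),
   the effect on a C loop like:

     void f (double *x, int stride, int n)
     {
       for (int i = 0; i < n; ++i)
	 x[i * stride] = 100;
     }

   would be equivalent to writing:

     void f (double *x, int stride, int n)
     {
       if (stride == 1)
	 for (int i = 0; i < n; ++i)
	   x[i] = 100;
       else
	 for (int i = 0; i < n; ++i)
	   x[i * stride] = 100;
     }

   where the stride == 1 version is the one that later passes can
   vectorize or pattern-match easily.  */
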
/* Enumerates the likelihood that a particular value indexes the inner
   dimension of an array.  */
enum inner_likelihood {
  INNER_UNLIKELY,
  INNER_DONT_KNOW,
  INNER_LIKELY
};

/* Information about one term of an address_info.  */
struct address_term_info
{
  /* The value of the term is EXPR * MULTIPLIER.  */
  tree expr;
  unsigned HOST_WIDE_INT multiplier;

  /* The stride applied by EXPR in each iteration of some unrecorded loop,
     or null if no stride has been identified.  */
  tree stride;

  /* Enumerates the likelihood that EXPR indexes the inner dimension
     of an array.  */
  enum inner_likelihood inner_likelihood;

  /* True if STRIDE == 1 is a versioning opportunity when considered
     in isolation.  */
  bool versioning_opportunity_p;
};

/* Information about an address calculation, and the range of constant
   offsets applied to it.  */
struct address_info
{
  static const unsigned int MAX_TERMS = 8;

  /* One statement that calculates the address.  If multiple statements
     share the same address, we only record the first.  */
  gimple *stmt;

  /* The loop containing STMT (cached for convenience).  If multiple
     statements share the same address, they all belong to this loop.  */
  struct loop *loop;

  /* A decomposition of the calculation into a sum of terms plus an
     optional base.  When BASE is provided, it is never an SSA name.
     Once initialization is complete, all members of TERMS are SSA names.  */
  tree base;
  auto_vec<address_term_info, MAX_TERMS> terms;

  /* All bytes accessed from the address fall in the offset range
     [MIN_OFFSET, MAX_OFFSET).  */
  HOST_WIDE_INT min_offset, max_offset;
};

/* Stores addresses based on their base and terms (ignoring the offsets).  */
struct address_info_hasher : nofree_ptr_hash <address_info>
{
  static hashval_t hash (const address_info *);
  static bool equal (const address_info *, const address_info *);
};

/* Information about the versioning we'd like to apply to a loop.  */
struct loop_info
{
  bool worth_versioning_p () const;

  /* True if we've decided not to version this loop.  The remaining
     fields are meaningless if so.  */
  bool rejected_p;

  /* True if at least one subloop of this loop benefits from versioning.  */
  bool subloops_benefit_p;

  /* An estimate of the total number of instructions in the loop,
     excluding those in subloops that benefit from versioning.  */
  unsigned int num_insns;

  /* The outermost loop that can handle all the version checks
     described below.  */
  struct loop *outermost;

  /* The first entry in the list of blocks that belong to this loop
     (and not to subloops).  m_next_block_in_loop provides the chain
     pointers for the list.  */
  basic_block block_list;

  /* We'd like to version the loop for the case in which these SSA names
     (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
  bitmap_head unity_names;

  /* If versioning succeeds, this points to the version of the loop that
     assumes the version conditions hold.  */
  struct loop *optimized_loop;
};

/* The main pass structure.  */
class loop_versioning
{
public:
  loop_versioning (function *);
  ~loop_versioning ();
  unsigned int run ();

private:
  /* Used to walk the dominator tree to find loop versioning conditions
     that are always false.  */
  class lv_dom_walker : public dom_walker
  {
  public:
    lv_dom_walker (loop_versioning &);

    edge before_dom_children (basic_block) FINAL OVERRIDE;
    void after_dom_children (basic_block) FINAL OVERRIDE;

  private:
    /* The parent pass.  */
    loop_versioning &m_lv;

    /* Used to build context-dependent range information.  */
    evrp_range_analyzer m_range_analyzer;
  };

  /* Used to simplify statements based on conditions that are established
     by the version checks.  */
  class name_prop : public substitute_and_fold_engine
  {
  public:
    name_prop (loop_info &li) : m_li (li) {}
    tree get_value (tree) FINAL OVERRIDE;

  private:
    /* Information about the versioning we've performed on the loop.  */
    loop_info &m_li;
  };

  loop_info &get_loop_info (struct loop *loop) { return m_loops[loop->num]; }

  unsigned int max_insns_for_loop (struct loop *);
  bool expensive_stmt_p (gimple *);

  void version_for_unity (gimple *, tree);
  bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
				unsigned HOST_WIDE_INT * = 0);
  bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
  bool multiply_term_by (address_term_info &, tree);
  inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
  void dump_inner_likelihood (address_info &, address_term_info &);
  void analyze_stride (address_info &, address_term_info &,
		       tree, struct loop *);
  bool find_per_loop_multiplication (address_info &, address_term_info &);
  bool analyze_term_using_scevs (address_info &, address_term_info &);
  void analyze_arbitrary_term (address_info &, address_term_info &);
  void analyze_address_fragment (address_info &);
  void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
				tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
  void analyze_expr (gimple *, tree);
  bool analyze_block (basic_block);
  bool analyze_blocks ();

  void prune_loop_conditions (struct loop *, vr_values *);
  bool prune_conditions ();

  void merge_loop_info (struct loop *, struct loop *);
  void add_loop_to_queue (struct loop *);
  bool decide_whether_loop_is_versionable (struct loop *);
  bool make_versioning_decisions ();

  bool version_loop (struct loop *);
  void implement_versioning_decisions ();

  /* The function we're optimizing.  */
  function *m_fn;

  /* The obstack to use for all pass-specific bitmaps.  */
  bitmap_obstack m_bitmap_obstack;

  /* An obstack to use for general allocation.  */
  obstack m_obstack;

  /* The number of loops in the function.  */
  unsigned int m_nloops;

  /* The total number of loop version conditions we've found.  */
  unsigned int m_num_conditions;

  /* Assume that an address fragment of the form i * stride * scale
     (for variable stride and constant scale) will not benefit from
     versioning for stride == 1 when scale is greater than this value.  */
  unsigned HOST_WIDE_INT m_maximum_scale;

  /* Information about each loop.  */
  auto_vec<loop_info> m_loops;

  /* Used to form a linked list of blocks that belong to a loop,
     started by loop_info::block_list.  */
  auto_vec<basic_block> m_next_block_in_loop;

  /* The list of loops that we've decided to version.  */
  auto_vec<struct loop *> m_loops_to_version;

  /* A table of addresses in the current loop, keyed off their values
     but not their offsets.  */
  hash_table <address_info_hasher> m_address_table;

  /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
  auto_vec <address_info *, 32> m_address_list;
};

/* If EXPR is an SSA name and not a default definition, return the
   defining statement, otherwise return null.  */

static gimple *
maybe_get_stmt (tree expr)
{
  if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
    return SSA_NAME_DEF_STMT (expr);
  return NULL;
}

/* Like maybe_get_stmt, but also return null if the defining
   statement isn't an assignment.  */

static gassign *
maybe_get_assign (tree expr)
{
  return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
}

380    to type TYPE when analyzing pieces of an address.  */
381 
static bool
look_through_cast_p (tree type, tree from)
{
  return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
	  && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
}

/* Strip all conversions of integers or pointers from EXPR, regardless
   of whether the conversions are nops.  This is useful in the context
   of this pass because we're not trying to fold or simulate the
   expression; we just want to see how it's structured.  */

static tree
strip_casts (tree expr)
{
  const unsigned int MAX_NITERS = 4;

  tree type = TREE_TYPE (expr);
  while (CONVERT_EXPR_P (expr)
	 && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
    expr = TREE_OPERAND (expr, 0);

  for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
    {
      gassign *assign = maybe_get_assign (expr);
      if (assign
	  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
	  && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
	expr = gimple_assign_rhs1 (assign);
      else
	break;
    }
  return expr;
}

/* Compare two address_term_infos in the same address_info.  */

static int
compare_address_terms (const void *a_uncast, const void *b_uncast)
{
  const address_term_info *a = (const address_term_info *) a_uncast;
  const address_term_info *b = (const address_term_info *) b_uncast;

  if (a->expr != b->expr)
    return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;

  if (a->multiplier != b->multiplier)
    return a->multiplier < b->multiplier ? -1 : 1;

  return 0;
}

/* Dump ADDRESS using flags FLAGS.  */

static void
dump_address_info (dump_flags_t flags, address_info &address)
{
  if (address.base)
    dump_printf (flags, "%T + ", address.base);
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    {
      if (i != 0)
	dump_printf (flags, " + ");
      dump_printf (flags, "%T", address.terms[i].expr);
      if (address.terms[i].multiplier != 1)
	dump_printf (flags, " * %wd", address.terms[i].multiplier);
    }
  dump_printf (flags, " + [%wd, %wd]",
	       address.min_offset, address.max_offset - 1);
}

/* Hash an address_info based on its base and terms.  */

hashval_t
address_info_hasher::hash (const address_info *info)
{
  inchash::hash hash;
  hash.add_int (info->base ? TREE_CODE (info->base) : 0);
  hash.add_int (info->terms.length ());
  for (unsigned int i = 0; i < info->terms.length (); ++i)
    {
      hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
      hash.add_hwi (info->terms[i].multiplier);
    }
  return hash.end ();
}

/* Return true if two address_infos have equal bases and terms.  Other
   properties might be different (such as the statement or constant
   offset range).  */

bool
address_info_hasher::equal (const address_info *a, const address_info *b)
{
  if (a->base != b->base
      && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
    return false;

  if (a->terms.length () != b->terms.length ())
    return false;

  for (unsigned int i = 0; i < a->terms.length (); ++i)
    if (a->terms[i].expr != b->terms[i].expr
	|| a->terms[i].multiplier != b->terms[i].multiplier)
      return false;

  return true;
}

/* Return true if we want to version the loop, i.e. if we have a
   specific reason for doing so and no specific reason not to.  */

bool
loop_info::worth_versioning_p () const
{
  return (!rejected_p
	  && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
}

loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
{
}

/* Process BB before processing the blocks it dominates.  */

edge
loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
{
  m_range_analyzer.enter (bb);

  if (bb == bb->loop_father->header)
    m_lv.prune_loop_conditions (bb->loop_father,
				m_range_analyzer.get_vr_values ());

  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
       gsi_next (&si))
    m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);

  return NULL;
}

/* Process BB after processing the blocks it dominates.  */

void
loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
{
  m_range_analyzer.leave (bb);
}

/* Decide whether to replace VAL with a new value in a versioned loop.
   Return the new value if so, otherwise return null.  */

tree
loop_versioning::name_prop::get_value (tree val)
{
  if (TREE_CODE (val) == SSA_NAME
      && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
    return build_one_cst (TREE_TYPE (val));
  return NULL_TREE;
}

/* Initialize the structure to optimize FN.  */

loop_versioning::loop_versioning (function *fn)
  : m_fn (fn),
    m_nloops (number_of_loops (fn)),
    m_num_conditions (0),
    m_address_table (31)
{
  bitmap_obstack_initialize (&m_bitmap_obstack);
  gcc_obstack_init (&m_obstack);

  /* Initialize the loop information.  */
  m_loops.safe_grow_cleared (m_nloops);
  for (unsigned int i = 0; i < m_nloops; ++i)
    {
      m_loops[i].outermost = get_loop (m_fn, 0);
      bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
    }

  /* Initialize the list of blocks that belong to each loop.  */
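  /* (Each list is threaded through M_NEXT_BLOCK_IN_LOOP, which is indexed
     by block number; every block is pushed onto the head of the list for
     its containing loop.)  */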
  unsigned int nbbs = last_basic_block_for_fn (fn);
  m_next_block_in_loop.safe_grow (nbbs);
  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    {
      loop_info &li = get_loop_info (bb->loop_father);
      m_next_block_in_loop[bb->index] = li.block_list;
      li.block_list = bb;
    }

  /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
     unvectorizable code, since it is the largest size that can be
     handled efficiently by scalar code.  omp_max_vf calculates the
     maximum number of bytes in a vector, when such a value is relevant
     to loop optimization.  */
  m_maximum_scale = estimated_poly_value (omp_max_vf ());
  m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
}

loop_versioning::~loop_versioning ()
{
  bitmap_obstack_release (&m_bitmap_obstack);
  obstack_free (&m_obstack, NULL);
}

/* Return the maximum number of instructions allowed in LOOP before
   it becomes too big for versioning.

   There are separate limits for inner and outer loops.  The limit for
   inner loops applies only to loops that benefit directly from versioning.
   The limit for outer loops applies to all code in the outer loop and
   its subloops that *doesn't* benefit directly from versioning; such code
   would be "taken along for the ride".  The idea is that if the cost of
   the latter is small, it is better to version outer loops rather than
   inner loops, both to reduce the number of repeated checks and to enable
   more of the loop nest to be optimized as a natural nest (e.g. by loop
   interchange or outer-loop vectorization).  */

unsigned int
loop_versioning::max_insns_for_loop (struct loop *loop)
{
  return (loop->inner
	  ? PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_OUTER_INSNS)
	  : PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_INNER_INSNS));
}

/* Return true if for cost reasons we should avoid versioning any loop
   that contains STMT.

   Note that we don't need to check whether versioning is invalid for
   correctness reasons, since the versioning process does that for us.
   The conditions involved are too rare to be worth duplicating here.  */

bool
loop_versioning::expensive_stmt_p (gimple *stmt)
{
  if (gcall *call = dyn_cast <gcall *> (stmt))
    /* Assume for now that the time spent in an "expensive" call would
       overwhelm any saving from versioning.  */
    return !gimple_inexpensive_call_p (call);
  return false;
}

/* Record that we want to version the loop that contains STMT for the
   case in which SSA name NAME is equal to 1.  We already know that NAME
   is invariant in the loop.  */

void
loop_versioning::version_for_unity (gimple *stmt, tree name)
{
  struct loop *loop = loop_containing_stmt (stmt);
  loop_info &li = get_loop_info (loop);

  if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
    {
      /* This is the first time we've wanted to version LOOP for NAME.
	 Keep track of the outermost loop that can handle all versioning
	 checks in LI.  */
      struct loop *outermost
	= outermost_invariant_loop_for_expr (loop, name);
      if (loop_depth (li.outermost) < loop_depth (outermost))
	li.outermost = outermost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
			   " for when %T == 1", name);
	  if (outermost == loop)
	    dump_printf (MSG_NOTE, "; cannot hoist check further");
	  else
	    {
	      dump_printf (MSG_NOTE, "; could implement the check at loop"
			   " depth %d", loop_depth (outermost));
	      if (loop_depth (li.outermost) > loop_depth (outermost))
		dump_printf (MSG_NOTE, ", but other checks only allow"
			     " a depth of %d", loop_depth (li.outermost));
	    }
	  dump_printf (MSG_NOTE, "\n");
	}

      m_num_conditions += 1;
    }
  else
    {
      /* This is a duplicate request.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
			 " loop for when %T == 1\n", name);
    }
}

/* Return true if OP1_TREE is constant and if in principle it is worth
   versioning an address fragment of the form:

     i * OP1_TREE * OP2 * stride

   for the case in which stride == 1.  This in practice means testing
   whether:

     OP1_TREE * OP2 <= M_MAXIMUM_SCALE.

   If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */

bool
loop_versioning::acceptable_multiplier_p (tree op1_tree,
					  unsigned HOST_WIDE_INT op2,
					  unsigned HOST_WIDE_INT *result)
{
  if (tree_fits_uhwi_p (op1_tree))
    {
      unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
      /* The first part checks for overflow.  */
697 	{
698 	  if (result)
699 	    *result = op1 * op2;
700 	  return true;
701 	}
702     }
703   return false;
704 }
705 
706 /* Return true if it is worth using loop versioning on a memory access
707    of type TYPE.  Store the size of the access in *SIZE if so.  */
708 
709 bool
710 loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
711 {
712   return (TYPE_SIZE_UNIT (type)
713 	  && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
714 }
715 
716 /* See whether OP is constant and whether we can multiply TERM by that
717    constant without exceeding M_MAXIMUM_SCALE.  Return true and update
718    TERM if so.  */
719 
720 bool
721 loop_versioning::multiply_term_by (address_term_info &term, tree op)
722 {
723   return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
724 }
725 
726 /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
727    is likely to be indexing an innermost dimension, returning the result
728    as an INNER_* probability.  */
729 
730 inner_likelihood
731 loop_versioning::get_inner_likelihood (tree stride,
732 				       unsigned HOST_WIDE_INT multiplier)
733 {
734   const unsigned int MAX_NITERS = 8;
735 
736   /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
737      least one of those values is likely to be for the innermost dimension.
738      Record in UNLIKELY_P if at least one of those values is unlikely to be
739      for the innermost dimension.
740 
741      E.g. for:
742 
743        stride = cond ? a * b : 1
744 
745      we should treat STRIDE as being a likely inner dimension, since
746      we know that it is 1 under at least some circumstances.  (See the
747      Fortran example below.)  However:
748 
749        stride = a * b
750 
751      on its own is unlikely to be for the innermost dimension, since
752      that would require both a and b to be 1 at runtime.  */
753   bool unlikely_p = false;
754   tree worklist[MAX_NITERS];
755   unsigned int length = 0;
756   worklist[length++] = stride;
757   for (unsigned int i = 0; i < length; ++i)
758     {
759       tree expr = worklist[i];
760 
761       if (CONSTANT_CLASS_P (expr))
762 	{
763 	  /* See if EXPR * MULTIPLIER would be consistent with an individual
764 	     access or a small grouped access.  */
765 	  if (acceptable_multiplier_p (expr, multiplier))
766 	    return INNER_LIKELY;
767 	  else
768 	    unlikely_p = true;
769 	}
770       else if (gimple *stmt = maybe_get_stmt (expr))
771 	{
772 	  /* If EXPR is set by a PHI node, queue its arguments in case
773 	     we find one that is consistent with an inner dimension.
774 
775 	     An important instance of this is the Fortran handling of array
776 	     descriptors, which calculates the stride of the inner dimension
777 	     using a PHI equivalent of:
778 
779 		raw_stride = a.dim[0].stride;
780 		stride = raw_stride != 0 ? raw_stride : 1;
781 
782 	     (Strides for outer dimensions do not treat 0 specially.)  */
783 	  if (gphi *phi = dyn_cast <gphi *> (stmt))
784 	    {
785 	      unsigned int nargs = gimple_phi_num_args (phi);
786 	      for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
787 		worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
788 	    }
789 	  /* If the value is set by an assignment, expect it to be read
790 	     from memory (such as an array descriptor) rather than be
791 	     calculated.  */
792 	  else if (gassign *assign = dyn_cast <gassign *> (stmt))
793 	    {
794 	      if (!gimple_assign_load_p (assign))
795 		unlikely_p = true;
796 	    }
797 	  /* Things like calls don't really tell us anything.  */
798 	}
799     }
800 
801   /* We didn't find any possible values of STRIDE that were likely to be
802      for the innermost dimension.  If we found one that was actively
803      unlikely to be for the innermost dimension, assume that that applies
804      to STRIDE too.  */
805   return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
806 }
807 
808 /* Dump the likelihood that TERM's stride is for the innermost dimension.
809    ADDRESS is the address that contains TERM.  */
810 
811 void
812 loop_versioning::dump_inner_likelihood (address_info &address,
813 					address_term_info &term)
814 {
815   if (term.inner_likelihood == INNER_LIKELY)
816     dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
817 		     " innermost dimension\n", term.stride);
818   else if (term.inner_likelihood == INNER_UNLIKELY)
819     dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
820 		     " innermost dimension\n", term.stride);
821   else
822     dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
823 		     " is the innermost dimension\n", term.stride);
824 }
825 
826 /* The caller has identified that STRIDE is the stride of interest
827    in TERM, and that the stride is applied in OP_LOOP.  Record this
828    information in TERM, deciding whether STRIDE is likely to be for
829    the innermost dimension of an array and whether it represents a
830    versioning opportunity.  ADDRESS is the address that contains TERM.  */
831 
832 void
833 loop_versioning::analyze_stride (address_info &address,
834 				 address_term_info &term,
835 				 tree stride, struct loop *op_loop)
836 {
837   term.stride = stride;
838 
839   term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
840   if (dump_enabled_p ())
841     dump_inner_likelihood (address, term);
842 
843   /* To be a versioning opportunity we require:
844 
845      - The multiplier applied by TERM is equal to the access size,
846        so that when STRIDE is 1, the accesses in successive loop
847        iterations are consecutive.
848 
849        This is deliberately conservative.  We could relax it to handle
850        other cases (such as those with gaps between iterations) if we
851        find any real testcases for which it's useful.
852 
853      - the stride is applied in the same loop as STMT rather than
854        in an outer loop.  Although versioning for strides applied in
855        outer loops could help in some cases -- such as enabling
856        more loop interchange -- the savings are much lower than for
857        inner loops.
858 
859      - the stride is an SSA name that is invariant in STMT's loop,
860        since otherwise versioning isn't possible.  */
861   unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
862   if (term.multiplier == access_size
863       && address.loop == op_loop
864       && TREE_CODE (stride) == SSA_NAME
865       && expr_invariant_in_loop_p (address.loop, stride))
866     {
867       term.versioning_opportunity_p = true;
868       if (dump_enabled_p ())
869 	dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
870 			 " opportunity\n", stride);
871     }
872 }
873 
874 /* See whether address term TERM (which belongs to ADDRESS) is the result
875    of multiplying a varying SSA name by a loop-invariant SSA name.
876    Return true and update TERM if so.
877 
878    This handles both cases that SCEV might handle, such as:
879 
880      for (int i = 0; i < n; ++i)
881        res += a[i * stride];
882 
883    and ones in which the term varies arbitrarily between iterations, such as:
884 
885      for (int i = 0; i < n; ++i)
886        res += a[index[i] * stride];  */
887 
888 bool
889 loop_versioning::find_per_loop_multiplication (address_info &address,
890 					       address_term_info &term)
891 {
892   gassign *mult = maybe_get_assign (term.expr);
893   if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
894     return false;
895 
896   struct loop *mult_loop = loop_containing_stmt (mult);
897   if (!loop_outer (mult_loop))
898     return false;
899 
900   tree op1 = strip_casts (gimple_assign_rhs1 (mult));
901   tree op2 = strip_casts (gimple_assign_rhs2 (mult));
902   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
903     return false;
904 
905   bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
906   bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
907   if (invariant1_p == invariant2_p)
908     return false;
909 
910   /* Make sure that the loop invariant is OP2 rather than OP1.  */
911   if (invariant1_p)
912     std::swap (op1, op2);
913 
914   if (dump_enabled_p ())
915     dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
916 		     " * loop-invariant %T\n", term.expr, op1, op2);
917   analyze_stride (address, term, op2, mult_loop);
918   return true;
919 }
920 
921 /* Try to use scalar evolutions to find an address stride for TERM,
922    which belongs to ADDRESS.  Return true and update TERM if so.
923 
924    Here we are interested in any evolution information we can find,
925    not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
926    an outer loop obviously iterates over the inner dimension of an array,
927    that information can help us eliminate worthless versioning opportunities
928    in inner loops.  */
929 
930 bool
931 loop_versioning::analyze_term_using_scevs (address_info &address,
932 					   address_term_info &term)
933 {
934   gimple *setter = maybe_get_stmt (term.expr);
935   if (!setter)
936     return false;
937 
938   struct loop *wrt_loop = loop_containing_stmt (setter);
939   if (!loop_outer (wrt_loop))
940     return false;
941 
942   tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
943   if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
944     {
945       if (dump_enabled_p ())
946 	dump_printf_loc (MSG_NOTE, address.stmt,
947 			 "address term %T = %T\n", term.expr, chrec);
948 
949       /* Peel casts and accumulate constant multiplications, up to the
950 	 limit allowed by M_MAXIMUM_SCALE.  */
951       tree stride = strip_casts (CHREC_RIGHT (chrec));
952       while (TREE_CODE (stride) == MULT_EXPR
953 	     && multiply_term_by (term, TREE_OPERAND (stride, 1)))
954 	stride = strip_casts (TREE_OPERAND (stride, 0));
955 
956       gassign *assign;
957       while ((assign = maybe_get_assign (stride))
958 	     && gimple_assign_rhs_code (assign) == MULT_EXPR
959 	     && multiply_term_by (term, gimple_assign_rhs2 (assign)))
960 	{
961 	  if (dump_enabled_p ())
962 	    dump_printf_loc (MSG_NOTE, address.stmt,
963 			     "looking through %G", assign);
964 	  stride = strip_casts (gimple_assign_rhs1 (assign));
965 	}
966 
967       analyze_stride (address, term, stride, get_chrec_loop (chrec));
968       return true;
969     }
970 
971   return false;
972 }
973 
974 /* Address term TERM is an arbitrary term that provides no versioning
975    opportunities.  Analyze it to see whether it contains any likely
976    inner strides, so that we don't mistakenly version for other
977    (less likely) candidates.
978 
979    This copes with invariant innermost indices such as:
980 
981      x(i, :) = 100
982 
983    where the "i" component of the address is invariant in the loop
984    but provides the real inner stride.
985 
986    ADDRESS is the address that contains TERM.  */
987 
988 void
989 loop_versioning::analyze_arbitrary_term (address_info &address,
990 					 address_term_info &term)
991 
992 {
993   /* A multiplication offers two potential strides.  Pick the one that
994      is most likely to be an innermost stride.  */
995   tree expr = term.expr, alt = NULL_TREE;
996   gassign *mult = maybe_get_assign (expr);
997   if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
998     {
999       expr = strip_casts (gimple_assign_rhs1 (mult));
1000       alt = strip_casts (gimple_assign_rhs2 (mult));
1001     }
1002   term.stride = expr;
1003   term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
1004   if (alt)
1005     {
1006       inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
1007       if (alt_l > term.inner_likelihood)
1008 	{
1009 	  term.stride = alt;
1010 	  term.inner_likelihood = alt_l;
1011 	}
1012     }
1013   if (dump_enabled_p ())
1014     dump_inner_likelihood (address, term);
1015 }
1016 
1017 /* Try to identify loop strides in ADDRESS and try to choose realistic
1018    versioning opportunities based on these strides.
1019 
1020    The main difficulty here isn't finding strides that could be used
1021    in a version check (that's pretty easy).  The problem instead is to
1022    avoid versioning for some stride S that is unlikely ever to be 1 at
1023    runtime.  Versioning for S == 1 on its own would lead to unnecessary
1024    code bloat, while adding S == 1 to more realistic version conditions
1025    would lose the optimisation opportunity offered by those other conditions.
1026 
1027    For example, versioning for a stride of 1 in the Fortran code:
1028 
1029      integer :: a(:,:)
1030      a(1,:) = 1
1031 
1032    is not usually a good idea, since the assignment is iterating over
1033    an outer dimension and is relatively unlikely to have a stride of 1.
1034    (It isn't impossible, since the inner dimension might be 1, or the
1035    array might be transposed.)  Similarly, in:
1036 
1037      integer :: a(:,:), b(:,:)
1038      b(:,1) = a(1,:)
1039 
1040    b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1041    Versioning for when both strides are 1 would lose most of the benefit
1042    of versioning for b's access.
1043 
1044    The approach we take is as follows:
1045 
1046    - Analyze each term to see whether it has an identifiable stride,
1047      regardless of which loop applies the stride.
1048 
1049    - Evaluate the likelihood that each such stride is for the innermost
1050      dimension of an array, on the scale "likely", "don't know" or "unlikely".
1051 
1052    - If there is a single "likely" innermost stride, and that stride is
1053      applied in the loop that contains STMT, version the loop for when the
1054      stride is 1.  This deals with the cases in which we're fairly
1055      confident of doing the right thing, such as the b(:,1) reference above.
1056 
1057    - If there are no "likely" innermost strides, and the loop that contains
1058      STMT uses a stride that we rated as "don't know", version for when
1059      that stride is 1.  This is principally used for C code such as:
1060 
1061        for (int i = 0; i < n; ++i)
1062 	 a[i * x] = ...;
1063 
1064      and:
1065 
1066        for (int j = 0; j < n; ++j)
1067 	 for (int i = 0; i < n; ++i)
1068 	   a[i * x + j * y] = ...;
1069 
1070      where nothing in the way "x" and "y" are set gives a hint as to
1071      whether "i" iterates over the innermost dimension of the array.
     In these situations it seems reasonable to assume that the
     programmer has nested the loops appropriately (although of course
     there are examples like GEMM in which this assumption doesn't hold
     for all accesses in the loop).

     This case is also useful for the Fortran equivalent of the
     above C code.  */

void
loop_versioning::analyze_address_fragment (address_info &address)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
      dump_address_info (MSG_NOTE, address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Analyze each component of the sum to see whether it involves an
     apparent stride.

     There is an overlap between the addresses that
     find_per_loop_multiplication and analyze_term_using_scevs can handle,
     but the former is much cheaper than SCEV analysis, so try it first.  */
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (!find_per_loop_multiplication (address, address.terms[i])
	&& !analyze_term_using_scevs (address, address.terms[i])
	&& !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
      analyze_arbitrary_term (address, address.terms[i]);

  /* Check for strides that are likely to be for the innermost dimension.

     1. If there is a single likely inner stride, if it is an SSA name,
	and if it is worth versioning the loop for when the SSA name
	equals 1, record that we want to do so.

     2. Otherwise, if there are any likely inner strides, bail out.  This means
	one of:

	(a) There are multiple likely inner strides.  This suggests we're
	    confused and can't be confident of doing the right thing.

	(b) There is a single likely inner stride and it is a constant
	    rather than an SSA name.  This can mean either that the access
	    is a natural one without any variable strides, such as:

	      for (int i = 0; i < n; ++i)
		a[i] += 1;

	    or that a variable stride is applied to an outer dimension,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j * stride][i] += 1;

	(c) There is a single likely inner stride, and it is an SSA name,
	    but it isn't a worthwhile versioning opportunity.  This usually
	    means that the variable stride is applied by an outer loop,
	    such as:

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[j][i * stride] += 1;

	    or (using an example with a more natural loop nesting):

	      for (int i = 0; i < n; ++i)
		for (int j = 0; j < n; ++j)
		  a[i][j] += b[i * stride];

	    in cases where b[i * stride] cannot (yet) be hoisted for
	    aliasing reasons.

     3. If there are no likely inner strides, fall through to the next
	set of checks.

     Pointer equality is enough to check for uniqueness in (1), since we
     only care about SSA names.  */
  tree chosen_stride = NULL_TREE;
  tree version_stride = NULL_TREE;
  for (unsigned int i = 0; i < address.terms.length (); ++i)
    if (chosen_stride != address.terms[i].stride
	&& address.terms[i].inner_likelihood == INNER_LIKELY)
      {
	if (chosen_stride)
	  return;
	chosen_stride = address.terms[i].stride;
	if (address.terms[i].versioning_opportunity_p)
	  version_stride = chosen_stride;
      }

  /* If there are no likely inner strides, see if there is a single
     versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
     See the comment above the function for the cases that this code
     handles.  */
  if (!chosen_stride)
    for (unsigned int i = 0; i < address.terms.length (); ++i)
      if (version_stride != address.terms[i].stride
	  && address.terms[i].inner_likelihood == INNER_DONT_KNOW
	  && address.terms[i].versioning_opportunity_p)
	{
	  if (version_stride)
	    return;
	  version_stride = address.terms[i].stride;
	}

  if (version_stride)
    version_for_unity (address.stmt, version_stride);
}

/* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
   TYPE_SIZE bytes and record this address fragment for later processing.
   STMT is the statement that contains the address.  */

void
loop_versioning::record_address_fragment (gimple *stmt,
					  unsigned HOST_WIDE_INT type_size,
					  tree expr,
					  unsigned HOST_WIDE_INT multiplier,
					  HOST_WIDE_INT offset)
{
  /* We're only interested in computed values.  */
  if (TREE_CODE (expr) != SSA_NAME)
    return;

  /* Quick exit if no part of the address is calculated in STMT's loop,
     since such addresses have no versioning opportunities.  */
  struct loop *loop = loop_containing_stmt (stmt);
  if (expr_invariant_in_loop_p (loop, expr))
    return;

  /* Set up an address_info for EXPR * MULTIPLIER.  */
  address_info *address = XOBNEW (&m_obstack, address_info);
  new (address) address_info;
  address->stmt = stmt;
  address->loop = loop;
  address->base = NULL_TREE;
  address->terms.quick_grow (1);
  address->terms[0].expr = expr;
  address->terms[0].multiplier = multiplier;
  address->terms[0].stride = NULL_TREE;
  address->terms[0].inner_likelihood = INNER_UNLIKELY;
  address->terms[0].versioning_opportunity_p = false;
  address->min_offset = offset;

  /* Peel apart the expression into a sum of address_terms, where each
     term is multiplied by a constant.  Treat a + b and a - b the same,
     since it doesn't matter for our purposes whether an address is
     increasing or decreasing.  Distribute (a + b) * constant into
     a * constant + b * constant.

     We don't care which loop each term belongs to, since we want to
     examine as many candidate strides as possible when determining
     which is likely to be for the innermost dimension.  We therefore
     don't limit the search to statements in STMT's loop.  */
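  /* For example, a term E * M in which E is defined as A + 4 becomes
     A * M with 4 * M folded into MIN_OFFSET; E defined as A * 4 becomes
     A with multiplier 4 * M (subject to M_MAXIMUM_SCALE); and E defined
     as A + B splits into the two terms A * M and B * M.  */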
  for (unsigned int i = 0; i < address->terms.length (); )
    {
      if (gassign *assign = maybe_get_assign (address->terms[i].expr))
	{
	  tree_code code = gimple_assign_rhs_code (assign);
	  if (code == PLUS_EXPR
	      || code == POINTER_PLUS_EXPR
	      || code == MINUS_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (TREE_CODE (op2) == INTEGER_CST)
		{
		  address->terms[i].expr = strip_casts (op1);
		  /* This is heuristic only, so don't worry about truncation
		     or overflow.  */
		  address->min_offset += (TREE_INT_CST_LOW (op2)
					  * address->terms[i].multiplier);
		  continue;
		}
	      else if (address->terms.length () < address_info::MAX_TERMS)
		{
		  unsigned int j = address->terms.length ();
		  address->terms.quick_push (address->terms[i]);
		  address->terms[i].expr = strip_casts (op1);
		  address->terms[j].expr = strip_casts (op2);
		  continue;
		}
	    }
	  if (code == MULT_EXPR)
	    {
	      tree op1 = gimple_assign_rhs1 (assign);
	      tree op2 = gimple_assign_rhs2 (assign);
	      if (multiply_term_by (address->terms[i], op2))
		{
		  address->terms[i].expr = strip_casts (op1);
		  continue;
		}
	    }
	}
      i += 1;
    }

  /* Peel off any symbolic pointer.  */
  if (TREE_CODE (address->terms[0].expr) != SSA_NAME
      && address->terms[0].multiplier == 1)
    {
      if (address->terms.length () == 1)
	{
	  obstack_free (&m_obstack, address);
	  return;
	}
      address->base = address->terms[0].expr;
      address->terms.ordered_remove (0);
    }

  /* Require all remaining terms to be SSA names.  (This could be false
     for unfolded statements, but they aren't worth dealing with.)  */
  for (unsigned int i = 0; i < address->terms.length (); ++i)
    if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
      {
	obstack_free (&m_obstack, address);
	return;
      }

  /* The loop above set MIN_OFFSET based on the first byte of the
     referenced data.  Calculate the end + 1.  */
  address->max_offset = address->min_offset + type_size;

  /* Put the terms into a canonical order for the hash table lookup below.  */
  address->terms.qsort (compare_address_terms);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
      if (multiplier != 1)
	dump_printf (MSG_NOTE, " * %wd", multiplier);
      dump_printf (MSG_NOTE, " = ");
      dump_address_info (MSG_NOTE, *address);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Pool address information with the same terms (but potentially
     different offsets).  */
  address_info **slot = m_address_table.find_slot (address, INSERT);
  if (address_info *old_address = *slot)
    {
      /* We've already seen an address with the same terms.  Extend the
	 offset range to account for this access.  Doing this can paper
	 over gaps, such as in:

	   a[i * stride * 4] + a[i * stride * 4 + 3];

	 where nothing references "+ 1" or "+ 2".  However, the vectorizer
	 handles such gapped accesses without problems, so it's not worth
	 trying to exclude them.  */
      if (old_address->min_offset > address->min_offset)
	old_address->min_offset = address->min_offset;
      if (old_address->max_offset < address->max_offset)
	old_address->max_offset = address->max_offset;
      obstack_free (&m_obstack, address);
    }
  else
    {
      /* This is the first time we've seen an address with these terms.  */
      *slot = address;
      m_address_list.safe_push (address);
    }
}

/* Analyze expression EXPR, which occurs in STMT.  */

void
loop_versioning::analyze_expr (gimple *stmt, tree expr)
{
  unsigned HOST_WIDE_INT type_size;

  while (handled_component_p (expr))
    {
      /* See whether we can use versioning to avoid a multiplication
	 in an array index.  */
      if (TREE_CODE (expr) == ARRAY_REF
	  && acceptable_type_p (TREE_TYPE (expr), &type_size))
	record_address_fragment (stmt, type_size,
				 TREE_OPERAND (expr, 1), type_size, 0);
      expr = TREE_OPERAND (expr, 0);
    }

  /* See whether we can use versioning to avoid a multiplication
     in the pointer calculation of a MEM_REF.  */
  if (TREE_CODE (expr) == MEM_REF
      && acceptable_type_p (TREE_TYPE (expr), &type_size))
    record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
			     /* This is heuristic only, so don't worry
				about truncation or overflow.  */
			     TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));

  /* These would be easy to handle if they existed at this stage.  */
  gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
}

/* Analyze all the statements in BB looking for useful version checks.
   Return true on success, false if something prevents the block from
   being versioned.  */

bool
loop_versioning::analyze_block (basic_block bb)
{
  struct loop *loop = bb->loop_father;
  loop_info &li = get_loop_info (loop);
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (is_gimple_debug (stmt))
	continue;

      if (expensive_stmt_p (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
			     " prevents versioning: %G", stmt);
	  return false;
	}

      /* Only look for direct versioning opportunities in inner loops
	 since the benefit tends to be much smaller for outer loops.  */
      if (!loop->inner)
	{
	  unsigned int nops = gimple_num_ops (stmt);
	  for (unsigned int i = 0; i < nops; ++i)
	    if (tree op = gimple_op (stmt, i))
	      analyze_expr (stmt, op);
	}

      /* The point of the instruction limit is to prevent excessive
	 code growth, so this is a size-based estimate even though
	 the optimization is aimed at speed.  */
      li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
    }

  return true;
}

/* Analyze all the blocks in the function, looking for useful version checks.
   Return true if we found one.  */

bool
loop_versioning::analyze_blocks ()
{
  AUTO_DUMP_SCOPE ("analyze_blocks",
		   dump_user_location_t::from_function_decl (m_fn->decl));

  /* For now we don't try to version the whole function, although
     versioning at that level could be useful in some cases.  */
  get_loop_info (get_loop (m_fn, 0)).rejected_p = true;

  struct loop *loop;
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop_info &linfo = get_loop_info (loop);

      /* Ignore cold loops.  */
      if (!optimize_loop_for_speed_p (loop))
	linfo.rejected_p = true;

      /* See whether an inner loop prevents versioning of this loop.  */
      if (!linfo.rejected_p)
	for (struct loop *inner = loop->inner; inner; inner = inner->next)
	  if (get_loop_info (inner).rejected_p)
	    {
	      linfo.rejected_p = true;
	      break;
	    }

      /* If versioning the loop is still a possibility, examine the
	 statements in the loop to look for versioning opportunities.  */
      if (!linfo.rejected_p)
	{
	  void *start_point = obstack_alloc (&m_obstack, 0);

	  for (basic_block bb = linfo.block_list; bb;
	       bb = m_next_block_in_loop[bb->index])
	    if (!analyze_block (bb))
	      {
		linfo.rejected_p = true;
		break;
	      }
1456 
1457 	  if (!linfo.rejected_p)
1458 	    {
1459 	      /* Process any queued address fragments, now that we have
1460 		 complete grouping information.  */
1461 	      address_info *address;
1462 	      unsigned int i;
1463 	      FOR_EACH_VEC_ELT (m_address_list, i, address)
1464 		analyze_address_fragment (*address);
1465 	    }
1466 
1467 	  m_address_table.empty ();
1468 	  m_address_list.truncate (0);
1469 	  obstack_free (&m_obstack, start_point);
1470 	}
1471     }
1472 
1473   return m_num_conditions != 0;
1474 }
1475 
1476 /* Use the ranges in VRS to remove impossible versioning conditions from
1477    LOOP.  */
1478 
1479 void
1480 loop_versioning::prune_loop_conditions (struct loop *loop, vr_values *vrs)
1481 {
1482   loop_info &li = get_loop_info (loop);
1483 
1484   int to_remove = -1;
1485   bitmap_iterator bi;
1486   unsigned int i;
1487   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1488     {
1489       tree name = ssa_name (i);
1490       value_range *vr = vrs->get_value_range (name);
1491       if (vr && !range_includes_p (vr, 1))
1492 	{
1493 	  if (dump_enabled_p ())
1494 	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1495 			     "%T can never be 1 in this loop\n", name);
1496 
1497 	  if (to_remove >= 0)
1498 	    bitmap_clear_bit (&li.unity_names, to_remove);
1499 	  to_remove = i;
1500 	  m_num_conditions -= 1;
1501 	}
1502     }
1503   if (to_remove >= 0)
1504     bitmap_clear_bit (&li.unity_names, to_remove);
1505 }
1506 
1507 /* Remove any scheduled loop version conditions that will never be true.
1508    Return true if any remain.  */
1509 
1510 bool
1511 loop_versioning::prune_conditions ()
1512 {
1513   AUTO_DUMP_SCOPE ("prune_loop_conditions",
1514 		   dump_user_location_t::from_function_decl (m_fn->decl));
1515 
1516   calculate_dominance_info (CDI_DOMINATORS);
1517   lv_dom_walker dom_walker (*this);
1518   dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1519   return m_num_conditions != 0;
1520 }
1521 
1522 /* Merge the version checks for INNER into immediately-enclosing loop
1523    OUTER.  */
1524 
1525 void
1526 loop_versioning::merge_loop_info (struct loop *outer, struct loop *inner)
1527 {
1528   loop_info &inner_li = get_loop_info (inner);
1529   loop_info &outer_li = get_loop_info (outer);
1530 
1531   if (dump_enabled_p ())
1532     {
1533       bitmap_iterator bi;
1534       unsigned int i;
1535       EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1536 	if (!bitmap_bit_p (&outer_li.unity_names, i))
1537 	  dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1538 			   "hoisting check that %T == 1 to outer loop\n",
1539 			   ssa_name (i));
1540     }
1541 
1542   bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1543   if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1544     outer_li.outermost = inner_li.outermost;
1545 }
1546 
1547 /* Add LOOP to the queue of loops to version.  */
1548 
1549 void
1550 loop_versioning::add_loop_to_queue (struct loop *loop)
1551 {
1552   loop_info &li = get_loop_info (loop);
1553 
1554   if (dump_enabled_p ())
1555     dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1556 		     "queuing this loop for versioning\n");
1557   m_loops_to_version.safe_push (loop);
1558 
1559   /* Don't try to version superloops.  */
1560   li.rejected_p = true;
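  /* (Each ancestor will see rejected_p set when it examines its inner
     loops and will therefore decline to version itself.)  */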
1561 }
1562 
1563 /* Decide whether the cost model would allow us to version LOOP,
1564    either directly or as part of a parent loop, and return true if so.
1565    This does not imply that the loop is actually worth versioning in its
1566    own right, just that it would be valid to version it if something
1567    benefited.
1568 
1569    We have already made this decision for all inner loops of LOOP.  */
1570 
1571 bool
1572 loop_versioning::decide_whether_loop_is_versionable (struct loop *loop)
1573 {
1574   loop_info &li = get_loop_info (loop);
1575 
1576   if (li.rejected_p)
1577     return false;
1578 
1579   /* Examine the decisions made for inner loops.  */
1580   for (struct loop *inner = loop->inner; inner; inner = inner->next)
1581     {
1582       loop_info &inner_li = get_loop_info (inner);
1583       if (inner_li.rejected_p)
1584 	{
1585 	  if (dump_enabled_p ())
1586 	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1587 			     "not versioning this loop because one of its"
1588 			     " inner loops should not be versioned\n");
1589 	  return false;
1590 	}
1591 
1592       if (inner_li.worth_versioning_p ())
1593 	li.subloops_benefit_p = true;
1594 
1595       /* Accumulate the number of instructions from subloops that are not
1596 	 the innermost, or that don't benefit from versioning.  Only the
1597 	 instructions from innermost loops that benefit from versioning
1598 	 should be weighed against loop-versioning-max-inner-insns;
1599 	 everything else should be weighed against
1600 	 loop-versioning-max-outer-insns.  */
1601       if (!inner_li.worth_versioning_p () || inner->inner)
1602 	{
1603 	  if (dump_enabled_p ())
1604 	    dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1605 			     "counting %d instructions from this loop"
1606 			     " against its parent loop\n", inner_li.num_insns);
1607 	  li.num_insns += inner_li.num_insns;
1608 	}
1609     }
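  /* li.num_insns now covers this loop's own statements plus those of
     any subloops counted against it above.  */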
1610 
1611   /* Enforce the size limits.  */
1612   if (li.worth_versioning_p ())
1613     {
1614       unsigned int max_num_insns = max_insns_for_loop (loop);
1615       if (dump_enabled_p ())
1616 	dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1617 			 "this loop has %d instructions, against"
1618 			 " a versioning limit of %d\n",
1619 			 li.num_insns, max_num_insns);
1620       if (li.num_insns > max_num_insns)
1621 	{
1622 	  if (dump_enabled_p ())
1623 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION
1624 			     | MSG_PRIORITY_USER_FACING,
1625 			     find_loop_location (loop),
1626 			     "this loop is too big to version");
1627 	  return false;
1628 	}
1629     }
1630 
1631   /* Hoist all version checks from subloops to this loop.  */
1632   for (struct loop *subloop = loop->inner; subloop; subloop = subloop->next)
1633     merge_loop_info (loop, subloop);
1634 
1635   return true;
1636 }
1637 
1638 /* Decide which loops to version and add them to the versioning queue.
1639    Return true if there are any loops to version.  */
1640 
1641 bool
1642 loop_versioning::make_versioning_decisions ()
1643 {
1644   AUTO_DUMP_SCOPE ("make_versioning_decisions",
1645 		   dump_user_location_t::from_function_decl (m_fn->decl));
1646 
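  /* Visit loops innermost-first, so that by the time an outer loop is
     reached, versioning decisions have already been made for all of its
     subloops.  */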
1647   struct loop *loop;
1648   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1649     {
1650       loop_info &linfo = get_loop_info (loop);
1651       if (decide_whether_loop_is_versionable (loop))
1652 	{
1653 	  /* Commit to versioning LOOP directly if we can't hoist the
1654 	     version checks any further.  */
1655 	  if (linfo.worth_versioning_p ()
1656 	      && (loop_depth (loop) == 1 || linfo.outermost == loop))
1657 	    add_loop_to_queue (loop);
1658 	}
1659       else
1660 	{
1661 	  /* We can't version this loop, so individually version any
1662 	     subloops that would benefit and haven't been versioned yet.  */
1663 	  linfo.rejected_p = true;
1664 	  for (struct loop *subloop = loop->inner; subloop;
1665 	       subloop = subloop->next)
1666 	    if (get_loop_info (subloop).worth_versioning_p ())
1667 	      add_loop_to_queue (subloop);
1668 	}
1669     }
1670 
1671   return !m_loops_to_version.is_empty ();
1672 }
1673 
1674 /* Attempt to implement loop versioning for LOOP, using the information
1675    cached in the associated loop_info.  Return true on success.  */
1676 
1677 bool
1678 loop_versioning::version_loop (struct loop *loop)
1679 {
1680   loop_info &li = get_loop_info (loop);
1681 
1682   /* Build up a condition that selects the original loop instead of
1683      the simplified loop.  */
1684   tree cond = boolean_false_node;
1685   bitmap_iterator bi;
1686   unsigned int i;
1687   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1688     {
1689       tree name = ssa_name (i);
1690       tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1691 				 build_one_cst (TREE_TYPE (name)));
1692       cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1693     }
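  /* With two queued strides s1_3 and s2_7 (hypothetical SSA names),
     COND is now effectively "false || s1_3 != 1 || s2_7 != 1", which
     the fold_build2 calls reduce to "s1_3 != 1 || s2_7 != 1".  */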
1694 
1695   /* Convert the condition into a suitable gcond.  */
1696   gimple_seq stmts = NULL;
1697   cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);
1698 
1699   /* Version the loop.  */
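  /* On success the original LOOP is the version guarded by COND being
     true (i.e. some stride is not 1); the copy returned in
     li.optimized_loop runs when COND is false and is simplified
     later.  */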
1700   initialize_original_copy_tables ();
1701   basic_block cond_bb;
1702   li.optimized_loop = loop_version (loop, cond, &cond_bb,
1703 				    profile_probability::unlikely (),
1704 				    profile_probability::likely (),
1705 				    profile_probability::unlikely (),
1706 				    profile_probability::likely (), true);
1707   free_original_copy_tables ();
1708   if (!li.optimized_loop)
1709     {
1710       if (dump_enabled_p ())
1711 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1712 			 "tried but failed to version this loop for when"
1713 			 " certain strides are 1\n");
1714       return false;
1715     }
1716 
1717   if (dump_enabled_p ())
1718     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1719 		     "versioned this loop for when certain strides are 1\n");
1720 
1721   /* Insert the statements that feed COND.  */
1722   if (stmts)
1723     {
1724       gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1725       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1726     }
1727 
1728   return true;
1729 }
1730 
1731 /* Attempt to version all loops in the versioning queue.  */
1732 
1733 void
1734 loop_versioning::implement_versioning_decisions ()
1735 {
1736   /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1737      user-facing at this point.  */
1738 
1739   bool any_succeeded_p = false;
1740   struct loop *loop;
1741   unsigned int i;
1742   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1743     if (version_loop (loop))
1744       any_succeeded_p = true;
1745   if (!any_succeeded_p)
1746     return;
1747 
1748   update_ssa (TODO_update_ssa);
1749 
1750   /* Simplify the new loop, which is used when COND is false.  */
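  /* name_prop (defined earlier in this file) substitutes the constant 1
     for each SSA name recorded in unity_names, which lets the folder
     delete the multiplications by the stride in the new loop body.  */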
1751   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1752     {
1753       loop_info &linfo = get_loop_info (loop);
1754       if (linfo.optimized_loop)
1755 	name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1756     }
1757 }
1758 
1759 /* Run the pass and return a set of TODO_* flags.  */
1760 
1761 unsigned int
1762 loop_versioning::run ()
1763 {
1764   gcc_assert (scev_initialized_p ());
1765 
1766   if (analyze_blocks ()
1767       && prune_conditions ()
1768       && make_versioning_decisions ())
1769     implement_versioning_decisions ();
1770 
1771   return 0;
1772 }
1773 
1774 /* Loop versioning pass.  */
1775 
1776 const pass_data pass_data_loop_versioning =
1777 {
1778   GIMPLE_PASS, /* type */
1779   "lversion", /* name */
1780   OPTGROUP_LOOP, /* optinfo_flags */
1781   TV_LOOP_VERSIONING, /* tv_id */
1782   PROP_cfg, /* properties_required */
1783   0, /* properties_provided */
1784   0, /* properties_destroyed */
1785   0, /* todo_flags_start */
1786   0, /* todo_flags_finish */
1787 };
1788 
1789 class pass_loop_versioning : public gimple_opt_pass
1790 {
1791 public:
1792   pass_loop_versioning (gcc::context *ctxt)
1793     : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1794   {}
1795 
1796   /* opt_pass methods: */
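  /* The pass is enabled by -fversion-loops-for-strides.  */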
1797   virtual bool gate (function *) { return flag_version_loops_for_strides; }
1798   virtual unsigned int execute (function *);
1799 };
1800 
1801 unsigned int
1802 pass_loop_versioning::execute (function *fn)
1803 {
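  /* number_of_loops includes the implicit outermost "loop" that spans
     the whole function, so a count of 1 means there are no real loops
     to version.  */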
1804   if (number_of_loops (fn) <= 1)
1805     return 0;
1806 
1807   return loop_versioning (fn).run ();
1808 }
1809 
1810 } // anon namespace
1811 
1812 gimple_opt_pass *
1813 make_pass_loop_versioning (gcc::context *ctxt)
1814 {
1815   return new pass_loop_versioning (ctxt);
1816 }
1817