xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-loop-manip.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /* Vectorizer Specific Loop Manipulations
2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 
51 /*************************************************************************
52   Simple Loop Peeling Utilities
53 
54   Utilities to support loop peeling for vectorization purposes.
55  *************************************************************************/
56 
57 
58 /* Renames the use *OP_P.  */
59 
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63   tree new_name;
64 
65   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66     return;
67 
68   new_name = get_current_def (USE_FROM_PTR (op_p));
69 
70   /* Something defined outside of the loop.  */
71   if (!new_name)
72     return;
73 
74   /* An ordinary ssa name defined in the loop.  */
75 
76   SET_USE (op_p, new_name);
77 }
78 
79 
80 /* Renames the variables in basic block BB.  Allow renaming of PHI arguments
81    on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
82    true.  */
83 
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87   gimple *stmt;
88   use_operand_p use_p;
89   ssa_op_iter iter;
90   edge e;
91   edge_iterator ei;
92   struct loop *loop = bb->loop_father;
93   struct loop *outer_loop = NULL;
94 
95   if (rename_from_outer_loop)
96     {
97       gcc_assert (loop);
98       outer_loop = loop_outer (loop);
99     }
100 
101   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102        gsi_next (&gsi))
103     {
104       stmt = gsi_stmt (gsi);
105       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 	rename_use_op (use_p);
107     }
108 
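  /* Rename PHI arguments on the incoming edges.  Arguments on edges from
     inside the loop are always renamed; arguments on edges from outside the
     loop are only renamed when RENAME_FROM_OUTER_LOOP allows it (see the
     checks below for the outer-loop header and the LOOP_VECTORIZED guard
     block).  */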
109   FOR_EACH_EDGE (e, ei, bb->preds)
110     {
111       if (!flow_bb_inside_loop_p (loop, e->src))
112 	{
113 	  if (!rename_from_outer_loop)
114 	    continue;
115 	  if (e->src != outer_loop->header)
116 	    {
117 	      if (outer_loop->inner->next)
118 		{
119 		  /* If outer_loop has 2 inner loops, allow there to
120 		     be an extra basic block which decides which of the
121 		     two loops to use using LOOP_VECTORIZED.  */
122 		  if (!single_pred_p (e->src)
123 		      || single_pred (e->src) != outer_loop->header)
124 		    continue;
125 		}
126 	    }
127 	}
128       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 	   gsi_next (&gsi))
130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131     }
132 }
133 
134 
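/* Record of one postponed debug-stmt adjustment: debug uses of FROM that are
   dominated by BB (but not by the definition of FROM) should be rewritten to
   use the loop-closed name TO.  */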
135 struct adjust_info
136 {
137   tree from, to;
138   basic_block bb;
139 };
140 
141 /* A stack of values to be adjusted in debug stmts.  We have to
142    process them LIFO, so that the closest substitution applies.  If we
143    processed them FIFO, without the stack, we might substitute uses
144    with a PHI DEF that would soon become non-dominant, and when we got
145    to the suitable one, it wouldn't have anything to substitute any
146    more.  */
147 static vec<adjust_info, va_heap> adjust_vec;
148 
149 /* Adjust any debug stmts that referenced AI->from values to use the
150    loop-closed AI->to, if the references are dominated by AI->bb and
151    not by the definition of AI->from.  */
152 
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156   basic_block bbphi = ai->bb;
157   tree orig_def = ai->from;
158   tree new_def = ai->to;
159   imm_use_iterator imm_iter;
160   gimple *stmt;
161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162 
163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164 
165   /* Adjust any debug stmts that held onto non-loop-closed
166      references.  */
167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168     {
169       use_operand_p use_p;
170       basic_block bbuse;
171 
172       if (!is_gimple_debug (stmt))
173 	continue;
174 
175       gcc_assert (gimple_debug_bind_p (stmt));
176 
177       bbuse = gimple_bb (stmt);
178 
179       if ((bbuse == bbphi
180 	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 	  && !(bbuse == bbdef
182 	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 	{
184 	  if (new_def)
185 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 	      SET_USE (use_p, new_def);
187 	  else
188 	    {
189 	      gimple_debug_bind_reset_value (stmt);
190 	      update_stmt (stmt);
191 	    }
192 	}
193     }
194 }
195 
196 /* Adjust debug stmts as scheduled before.  */
197 
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201   if (!MAY_HAVE_DEBUG_BIND_STMTS)
202     return;
203 
204   gcc_assert (adjust_vec.exists ());
205 
206   while (!adjust_vec.is_empty ())
207     {
208       adjust_debug_stmts_now (&adjust_vec.last ());
209       adjust_vec.pop ();
210     }
211 }
212 
213 /* Adjust any debug stmts that referenced FROM values to use the
214    loop-closed TO, if the references are dominated by BB and not by
215    the definition of FROM.  If the adjust_vec stack exists, adjustments
216    will be postponed until adjust_vec_debug_stmts is called.  */
217 
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221   adjust_info ai;
222 
223   if (MAY_HAVE_DEBUG_BIND_STMTS
224       && TREE_CODE (from) == SSA_NAME
225       && ! SSA_NAME_IS_DEFAULT_DEF (from)
226       && ! virtual_operand_p (from))
227     {
228       ai.from = from;
229       ai.to = to;
230       ai.bb = bb;
231 
232       if (adjust_vec.exists ())
233 	adjust_vec.safe_push (ai);
234       else
235 	adjust_debug_stmts_now (&ai);
236     }
237 }
238 
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240    to adjust any debug stmts that referenced the old phi arg,
241    presumably non-loop-closed references left over from other
242    transformations.  */
243 
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248 
249   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250 
251   if (MAY_HAVE_DEBUG_BIND_STMTS)
252     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 			gimple_bb (update_phi));
254 }
255 
256 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
257    the mask should have during the first iteration and NEXT_MASK is the
258    value that it should have on subsequent iterations.  */
259 
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 		    tree next_mask)
263 {
264   gphi *phi = create_phi_node (mask, loop->header);
265   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
268 
269 /* Add SEQ to the end of LOOP's preheader block.  */
270 
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274   if (seq)
275     {
276       edge pe = loop_preheader_edge (loop);
277       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278       gcc_assert (!new_bb);
279     }
280 }
281 
282 /* Add SEQ to the beginning of LOOP's header block.  */
283 
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287   if (seq)
288     {
289       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291     }
292 }
293 
294 /* Return true if the target can interleave elements of two vectors.
295    OFFSET is 0 if the first half of the vectors should be interleaved
296    or 1 if the second half should.  When returning true, store the
297    associated permutation in INDICES.  */
298 
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 			unsigned int offset)
302 {
303   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304   poly_uint64 base = exact_div (nelts, 2) * offset;
305   vec_perm_builder sel (nelts, 2, 3);
306   for (unsigned int i = 0; i < 3; ++i)
307     {
308       sel.quick_push (base + i);
309       sel.quick_push (base + i + nelts);
310     }
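  /* For example, with NELTS == 8 and OFFSET == 0 this encodes the selector
     { 0, 8, 1, 9, 2, 10, ... }, i.e. an interleave of the low halves of the
     two input vectors.  */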
311   indices->new_vector (sel, 2, nelts);
312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
314 
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316    in SRC_RGM, given that the former has twice as many masks as the
317    latter.  Return true on success, adding any new statements to SEQ.  */
318 
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 			       rgroup_masks *src_rgm)
322 {
323   tree src_masktype = src_rgm->mask_type;
324   tree dest_masktype = dest_rgm->mask_type;
325   machine_mode src_mode = TYPE_MODE (src_masktype);
326   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329     {
330       /* Unpacking the source masks gives at least as many mask bits as
331 	 we need.  We can then VIEW_CONVERT any excess bits away.  */
332       tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 	{
335 	  tree src = src_rgm->masks[i / 2];
336 	  tree dest = dest_rgm->masks[i];
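	  /* DEST_RGM masks 2N and 2N+1 are drawn from the first and second
	     halves of the elements of SRC_RGM mask N; which half corresponds
	     to VEC_UNPACK_LO/HI depends on the target's endianness.  */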
337 	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 			    ? VEC_UNPACK_HI_EXPR
339 			    : VEC_UNPACK_LO_EXPR);
340 	  gassign *stmt;
341 	  if (dest_masktype == unpack_masktype)
342 	    stmt = gimple_build_assign (dest, code, src);
343 	  else
344 	    {
345 	      tree temp = make_ssa_name (unpack_masktype);
346 	      stmt = gimple_build_assign (temp, code, src);
347 	      gimple_seq_add_stmt (seq, stmt);
348 	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 					  build1 (VIEW_CONVERT_EXPR,
350 						  dest_masktype, temp));
351 	    }
352 	  gimple_seq_add_stmt (seq, stmt);
353 	}
354       return true;
355     }
356   vec_perm_indices indices[2];
357   if (dest_masktype == src_masktype
358       && interleave_supported_p (&indices[0], src_masktype, 0)
359       && interleave_supported_p (&indices[1], src_masktype, 1))
360     {
361       /* The destination requires twice as many mask bits as the source, so
362 	 we can use interleaving permutes to double up the number of bits.  */
363       tree masks[2];
364       for (unsigned int i = 0; i < 2; ++i)
365 	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 	{
368 	  tree src = src_rgm->masks[i / 2];
369 	  tree dest = dest_rgm->masks[i];
370 	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 					      src, src, masks[i & 1]);
372 	  gimple_seq_add_stmt (seq, stmt);
373 	}
374       return true;
375     }
376   return false;
377 }
378 
379 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
380    all the masks in RGM and return a mask that is nonzero when the loop
381    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
382    Use LOOP_COND_GSI to insert code before the exit gcond.
383 
384    RGM belongs to loop LOOP.  The loop originally iterated NITERS
385    times and has been vectorized according to LOOP_VINFO.  Each iteration
386    of the vectorized loop handles VF iterations of the scalar loop.
387 
388    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389    starts with NITERS_SKIP dummy iterations of the scalar loop before
390    the real work starts.  The mask elements for these dummy iterations
391    must be 0, to ensure that the extra iterations do not have an effect.
392 
393    It is known that:
394 
395      NITERS * RGM->max_nscalars_per_iter
396 
397    does not overflow.  However, MIGHT_WRAP_P says whether an induction
398    variable that starts at 0 and has step:
399 
400      VF * RGM->max_nscalars_per_iter
401 
402    might overflow before hitting a value above:
403 
404      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405 
406    This means that we cannot guarantee that such an induction variable
407    would ever hit a value that produces a set of all-false masks for RGM.  */
408 
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 			      gimple_seq *preheader_seq,
412 			      gimple_stmt_iterator loop_cond_gsi,
413 			      rgroup_masks *rgm, tree vf,
414 			      tree niters, tree niters_skip,
415 			      bool might_wrap_p)
416 {
417   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418   tree mask_type = rgm->mask_type;
419   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421 
422   /* Calculate the maximum number of scalar values that the rgroup
423      handles in total, the number that it handles for each iteration
424      of the vector loop, and the number that it should skip during the
425      first iteration of the vector loop.  */
426   tree nscalars_total = niters;
427   tree nscalars_step = vf;
428   tree nscalars_skip = niters_skip;
429   if (nscalars_per_iter != 1)
430     {
431       /* We checked before choosing to use a fully-masked loop that these
432 	 multiplications don't overflow.  */
433       tree factor = build_int_cst (compare_type, nscalars_per_iter);
434       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 				     nscalars_total, factor);
436       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 				    nscalars_step, factor);
438       if (nscalars_skip)
439 	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 				      nscalars_skip, factor);
441     }
442 
443   /* Create an induction variable that counts the number of scalars
444      processed.  */
445   tree index_before_incr, index_after_incr;
446   gimple_stmt_iterator incr_gsi;
447   bool insert_after;
448   tree zero_index = build_int_cst (compare_type, 0);
449   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 	     insert_after, &index_before_incr, &index_after_incr);
452 
453   tree test_index, test_limit, first_limit;
454   gimple_stmt_iterator *test_gsi;
455   if (might_wrap_p)
456     {
457       /* In principle the loop should stop iterating once the incremented
458 	 IV reaches a value greater than or equal to:
459 
460 	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461 
462 	 However, there's no guarantee that this addition doesn't overflow
463 	 the comparison type, or that the IV hits a value above it before
464 	 wrapping around.  We therefore adjust the limit down by one
465 	 IV step:
466 
467 	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 	   -[infinite-prec] NSCALARS_STEP
469 
470 	 and compare the IV against this limit _before_ incrementing it.
471 	 Since the comparison type is unsigned, we actually want the
472 	 subtraction to saturate at zero:
473 
474 	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 	   -[sat] NSCALARS_STEP
476 
477 	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478 
479 	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480 
481 	 where the rightmost subtraction can be done directly in
482 	 COMPARE_TYPE.  */
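      /* For example, with NSCALARS_TOTAL = 10, NSCALARS_SKIP = 2 and
	 NSCALARS_STEP = 8 the limit is 10 -[sat] (8 - 2) = 4.  The IV values
	 tested before being incremented are then 0 and 8, giving the two
	 vector iterations needed to cover the 2 + 10 scalar positions.  */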
483       test_index = index_before_incr;
484       tree adjust = nscalars_step;
485       if (nscalars_skip)
486 	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 			       adjust, nscalars_skip);
488       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489 				 nscalars_total, adjust);
490       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491 				 test_limit, adjust);
492       test_gsi = &incr_gsi;
493 
494       /* Get a safe limit for the first iteration.  */
495       if (nscalars_skip)
496 	{
497 	  /* The first vector iteration can handle at most NSCALARS_STEP
498 	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
499 	     NSCALARS_SKIP to that cannot overflow.  */
500 	  tree const_limit = build_int_cst (compare_type,
501 					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 					    * nscalars_per_iter);
503 	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 				      nscalars_total, const_limit);
505 	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 				      first_limit, nscalars_skip);
507 	}
508       else
509 	/* For the first iteration it doesn't matter whether the IV hits
510 	   a value above NSCALARS_TOTAL.  That only matters for the latch
511 	   condition.  */
512 	first_limit = nscalars_total;
513     }
514   else
515     {
516       /* Test the incremented IV, which will always hit a value above
517 	 the bound before wrapping.  */
518       test_index = index_after_incr;
519       test_limit = nscalars_total;
520       if (nscalars_skip)
521 	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 				   test_limit, nscalars_skip);
523       test_gsi = &loop_cond_gsi;
524 
525       first_limit = test_limit;
526     }
527 
528   /* Provide a definition of each mask in the group.  */
529   tree next_mask = NULL_TREE;
530   tree mask;
531   unsigned int i;
532   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533     {
534       /* Previous masks will cover BIAS scalars.  This mask covers the
535 	 next batch.  */
536       poly_uint64 bias = nscalars_per_mask * i;
537       tree bias_tree = build_int_cst (compare_type, bias);
538       gimple *tmp_stmt;
539 
540       /* See whether the first iteration of the vector loop is known
541 	 to have a full mask.  */
542       poly_uint64 const_limit;
543       bool first_iteration_full
544 	= (poly_int_tree_p (first_limit, &const_limit)
545 	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546 
547       /* Rather than have a new IV that starts at BIAS and goes up to
548 	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 	 and adjust the bound down by BIAS.  */
550       tree this_test_limit = test_limit;
551       if (i != 0)
552 	{
553 	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 					  compare_type, this_test_limit,
555 					  bias_tree);
556 	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 					  compare_type, this_test_limit,
558 					  bias_tree);
559 	}
560 
561       /* Create the initial mask.  First include all scalars that
562 	 are within the loop limit.  */
563       tree init_mask = NULL_TREE;
564       if (!first_iteration_full)
565 	{
566 	  tree start, end;
567 	  if (first_limit == test_limit)
568 	    {
569 	      /* Use a natural test between zero (the initial IV value)
570 		 and the loop limit.  The "else" block would be valid too,
571 		 but this choice can avoid the need to load BIAS_TREE into
572 		 a register.  */
573 	      start = zero_index;
574 	      end = this_test_limit;
575 	    }
576 	  else
577 	    {
578 	      /* FIRST_LIMIT is the maximum number of scalars handled by the
579 		 first iteration of the vector loop.  Test the portion
580 		 associated with this mask.  */
581 	      start = bias_tree;
582 	      end = first_limit;
583 	    }
584 
585 	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 	  tmp_stmt = vect_gen_while (init_mask, start, end);
587 	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
588 	}
589 
590       /* Now AND out the bits that are within the number of skipped
591 	 scalars.  */
592       poly_uint64 const_skip;
593       if (nscalars_skip
594 	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 	       && known_le (const_skip, bias)))
596 	{
597 	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 						    bias_tree, nscalars_skip);
599 	  if (init_mask)
600 	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 				      init_mask, unskipped_mask);
602 	  else
603 	    init_mask = unskipped_mask;
604 	}
605 
606       if (!init_mask)
607 	/* First iteration is full.  */
608 	init_mask = build_minus_one_cst (mask_type);
609 
610       /* Get the mask value for the next iteration of the loop.  */
611       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614 
615       vect_set_loop_mask (loop, mask, init_mask, next_mask);
616     }
617   return next_mask;
618 }
619 
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
622    number of iterations of the original scalar loop that should be
623    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
624    as for vect_set_loop_condition.
625 
626    Insert the branch-back condition before LOOP_COND_GSI and return the
627    final gcond.  */
628 
629 static gcond *
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 				tree niters, tree final_iv,
632 				bool niters_maybe_zero,
633 				gimple_stmt_iterator loop_cond_gsi)
634 {
635   gimple_seq preheader_seq = NULL;
636   gimple_seq header_seq = NULL;
637 
638   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639   unsigned int compare_precision = TYPE_PRECISION (compare_type);
640   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641   tree orig_niters = niters;
642 
643   /* Type of the initial value of NITERS.  */
644   tree ni_actual_type = TREE_TYPE (niters);
645   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646 
647   /* Convert NITERS to the same size as the compare.  */
648   if (compare_precision > ni_actual_precision
649       && niters_maybe_zero)
650     {
651       /* We know that there is always at least one iteration, so if the
652 	 count is zero then it must have wrapped.  Cope with this by
653 	 subtracting 1 before the conversion and adding 1 to the result.  */
654       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 			     niters, build_minus_one_cst (ni_actual_type));
657       niters = gimple_convert (&preheader_seq, compare_type, niters);
658       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 			     niters, build_one_cst (compare_type));
660     }
661   else
662     niters = gimple_convert (&preheader_seq, compare_type, niters);
663 
664   /* Convert skip_niters to the right type.  */
665   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666 
667   /* Now calculate the value that the induction variable must be able
668      to hit in order to ensure that we end the loop with an all-false mask.
669      This involves adding the maximum number of inactive trailing scalar
670      iterations.  */
671   widest_int iv_limit;
672   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673   if (known_max_iters)
674     {
675       if (niters_skip)
676 	{
677 	  /* Add the maximum number of skipped iterations to the
678 	     maximum iteration count.  */
679 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
680 	    iv_limit += wi::to_widest (niters_skip);
681 	  else
682 	    iv_limit += max_vf - 1;
683 	}
684       /* IV_LIMIT is the maximum number of latch iterations, which is also
685 	 the maximum in-range IV value.  Round this value down to the previous
686 	 vector alignment boundary and then add an extra full iteration.  */
687       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
689     }
690 
691   /* Get the vectorization factor in tree form.  */
692   tree vf = build_int_cst (compare_type,
693 			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694 
695   /* Iterate over all the rgroups and fill in their masks.  We could use
696      the first mask from any rgroup for the loop condition; here we
697      arbitrarily pick the last.  */
698   tree test_mask = NULL_TREE;
699   rgroup_masks *rgm;
700   unsigned int i;
701   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702   FOR_EACH_VEC_ELT (*masks, i, rgm)
703     if (!rgm->masks.is_empty ())
704       {
705 	/* First try using permutes.  This adds a single vector
706 	   instruction to the loop for each mask, but needs no extra
707 	   loop invariants or IVs.  */
708 	unsigned int nmasks = i + 1;
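	/* The rgroup at index I needs I + 1 masks per iteration, so the
	   rgroup with half as many masks (if it exists) is found at index
	   NMASKS / 2 - 1.  */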
709 	if ((nmasks & 1) == 0)
710 	  {
711 	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 	    if (!half_rgm->masks.is_empty ()
713 		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 	      continue;
715 	  }
716 
717 	/* See whether a zero-based IV would ever generate all-false masks
718 	   before wrapping around.  */
719 	bool might_wrap_p
720 	  = (!known_max_iters
721 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 				    UNSIGNED)
723 		 > compare_precision));
724 
725 	/* Set up all masks for this group.  */
726 	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 						  &preheader_seq,
728 						  loop_cond_gsi, rgm, vf,
729 						  niters, niters_skip,
730 						  might_wrap_p);
731       }
732 
733   /* Emit all accumulated statements.  */
734   add_preheader_seq (loop, preheader_seq);
735   add_header_seq (loop, header_seq);
736 
737   /* Get a boolean result that tells us whether to iterate.  */
738   edge exit_edge = single_exit (loop);
739   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 					NULL_TREE, NULL_TREE);
743   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744 
745   /* The loop iterates (NITERS - 1) / VF + 1 times.
746      Subtract one from this to get the latch count.  */
747   tree step = build_int_cst (compare_type,
748 			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 				       build_minus_one_cst (compare_type));
751   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 				     niters_minus_one, step);
753 
754   if (final_iv)
755     {
756       gassign *assign = gimple_build_assign (final_iv, orig_niters);
757       gsi_insert_on_edge_immediate (single_exit (loop), assign);
758     }
759 
760   return cond_stmt;
761 }
762 
763 /* Like vect_set_loop_condition, but handle the case in which there
764    are no loop masks.  */
765 
766 static gcond *
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 				  tree step, tree final_iv,
769 				  bool niters_maybe_zero,
770 				  gimple_stmt_iterator loop_cond_gsi)
771 {
772   tree indx_before_incr, indx_after_incr;
773   gcond *cond_stmt;
774   gcond *orig_cond;
775   edge pe = loop_preheader_edge (loop);
776   edge exit_edge = single_exit (loop);
777   gimple_stmt_iterator incr_gsi;
778   bool insert_after;
779   enum tree_code code;
780   tree niters_type = TREE_TYPE (niters);
781 
782   orig_cond = get_loop_exit_condition (loop);
783   gcc_assert (orig_cond);
784   loop_cond_gsi = gsi_for_stmt (orig_cond);
785 
786   tree init, limit;
787   if (!niters_maybe_zero && integer_onep (step))
788     {
789       /* In this case we can use a simple 0-based IV:
790 
791 	 A:
792 	   x = 0;
793 	   do
794 	     {
795 	       ...
796 	       x += 1;
797 	     }
798 	   while (x < NITERS);  */
799       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800       init = build_zero_cst (niters_type);
801       limit = niters;
802     }
803   else
804     {
805       /* The following works for all values of NITERS except 0:
806 
807 	 B:
808 	   x = 0;
809 	   do
810 	     {
811 	       ...
812 	       x += STEP;
813 	     }
814 	   while (x <= NITERS - STEP);
815 
816 	 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 	 but stops if x + STEP - 1 >= NITERS.
818 
819 	 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 	 before wrapping around.  There are two obvious ways of dealing with
821 	 this:
822 
823 	 - start at STEP - 1 and compare x before incrementing it
824 	 - start at -1 and compare x after incrementing it
825 
826 	 The latter is simpler and is what we use.  The loop in this case
827 	 looks like:
828 
829 	 C:
830 	   x = -1;
831 	   do
832 	     {
833 	       ...
834 	       x += STEP;
835 	     }
836 	   while (x < NITERS - STEP);
837 
838 	 In both cases the loop limit is NITERS - STEP.  */
839       gimple_seq seq = NULL;
840       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842       if (seq)
843 	{
844 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 	  gcc_assert (!new_bb);
846 	}
847       if (niters_maybe_zero)
848 	{
849 	  /* Case C.  */
850 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 	  init = build_all_ones_cst (niters_type);
852 	}
853       else
854 	{
855 	  /* Case B.  */
856 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 	  init = build_zero_cst (niters_type);
858 	}
859     }
860 
861   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862   create_iv (init, step, NULL_TREE, loop,
863              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
864   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865 					      true, NULL_TREE, true,
866 					      GSI_SAME_STMT);
867   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
868 				     true, GSI_SAME_STMT);
869 
870   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
871 				 NULL_TREE);
872 
873   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
874 
875   /* Record the number of latch iterations.  */
876   if (limit == niters)
877     /* Case A: the loop iterates NITERS times.  Subtract one to get the
878        latch count.  */
879     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 				       build_int_cst (niters_type, 1));
881   else
882     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883        Subtract one from this to get the latch count.  */
884     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 				       limit, step);
886 
887   if (final_iv)
888     {
889       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 					     indx_after_incr, init);
891       gsi_insert_on_edge_immediate (single_exit (loop), assign);
892     }
893 
894   return cond_stmt;
895 }
896 
897 /* If we're using fully-masked loops, make LOOP iterate:
898 
899       N == (NITERS - 1) / STEP + 1
900 
901    times.  When NITERS is zero, this is equivalent to making the loop
902    execute (1 << M) / STEP times, where M is the precision of NITERS.
903    NITERS_MAYBE_ZERO is true if this last case might occur.
904 
905    If we're not using fully-masked loops, make LOOP iterate:
906 
907       N == (NITERS - STEP) / STEP + 1
908 
909    times, where NITERS is known to be outside the range [1, STEP - 1].
910    This is equivalent to making the loop execute NITERS / STEP times
911    when NITERS is nonzero and (1 << M) / STEP times otherwise.
912    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913 
914    If FINAL_IV is nonnull, it is an SSA name that should be set to
915    N * STEP on exit from the loop.
916 
917    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
918 
919 void
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 			 tree niters, tree step, tree final_iv,
922 			 bool niters_maybe_zero)
923 {
924   gcond *cond_stmt;
925   gcond *orig_cond = get_loop_exit_condition (loop);
926   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927 
928   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 						final_iv, niters_maybe_zero,
931 						loop_cond_gsi);
932   else
933     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 						  final_iv, niters_maybe_zero,
935 						  loop_cond_gsi);
936 
937   /* Remove old loop exit test.  */
938   stmt_vec_info orig_cond_info;
939   if (loop_vinfo
940       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
941     loop_vinfo->remove_stmt (orig_cond_info);
942   else
943     gsi_remove (&loop_cond_gsi, true);
944 
945   if (dump_enabled_p ())
946     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
947 		     cond_stmt);
948 }
949 
950 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
951    For each pair of PHI arguments in FROM->dest and TO->dest taken from
952    those edges, ensure that the current_def of the TO->dest argument is
953    set to the current_def of the corresponding FROM->dest argument.  */
954 
955 static void
956 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
957 {
958   gimple_stmt_iterator gsi_from, gsi_to;
959 
960   for (gsi_from = gsi_start_phis (from->dest),
961        gsi_to = gsi_start_phis (to->dest);
962        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
963     {
964       gimple *from_phi = gsi_stmt (gsi_from);
965       gimple *to_phi = gsi_stmt (gsi_to);
966       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
967       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
968       if (virtual_operand_p (from_arg))
969 	{
970 	  gsi_next (&gsi_from);
971 	  continue;
972 	}
973       if (virtual_operand_p (to_arg))
974 	{
975 	  gsi_next (&gsi_to);
976 	  continue;
977 	}
978       if (TREE_CODE (from_arg) != SSA_NAME)
979 	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
980       else if (TREE_CODE (to_arg) == SSA_NAME
981 	       && from_arg != to_arg)
982 	{
983 	  if (get_current_def (to_arg) == NULL_TREE)
984 	    {
985 	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
986 					      TREE_TYPE (get_current_def
987 							   (from_arg))));
988 	      set_current_def (to_arg, get_current_def (from_arg));
989 	    }
990 	}
991       gsi_next (&gsi_from);
992       gsi_next (&gsi_to);
993     }
994 
995   gphi *from_phi = get_virtual_phi (from->dest);
996   gphi *to_phi = get_virtual_phi (to->dest);
997   if (from_phi)
998     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
999 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1000 }
1001 
1002 
1003 /* Given LOOP, this function generates a new copy of it and puts it
1004    on E, which is either the entry or exit edge of LOOP.  If SCALAR_LOOP is
1005    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1006    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1007    entry or exit of LOOP.  */
1008 
1009 struct loop *
1010 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1011 					struct loop *scalar_loop, edge e)
1012 {
1013   struct loop *new_loop;
1014   basic_block *new_bbs, *bbs, *pbbs;
1015   bool at_exit;
1016   bool was_imm_dom;
1017   basic_block exit_dest;
1018   edge exit, new_exit;
1019   bool duplicate_outer_loop = false;
1020 
1021   exit = single_exit (loop);
1022   at_exit = (e == exit);
1023   if (!at_exit && e != loop_preheader_edge (loop))
1024     return NULL;
1025 
1026   if (scalar_loop == NULL)
1027     scalar_loop = loop;
1028 
1029   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1030   pbbs = bbs + 1;
1031   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1032   /* Allow duplication of outer loops.  */
1033   if (scalar_loop->inner)
1034     duplicate_outer_loop = true;
1035   /* Check whether duplication is possible.  */
1036   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1037     {
1038       free (bbs);
1039       return NULL;
1040     }
1041 
1042   /* Generate new loop structure.  */
1043   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1044   duplicate_subloops (scalar_loop, new_loop);
1045 
1046   exit_dest = exit->dest;
1047   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1048 					  exit_dest) == loop->header ?
1049 		 true : false);
1050 
1051   /* Also copy the pre-header, this avoids jumping through hoops to
1052      duplicate the loop entry PHI arguments.  Create an empty
1053      pre-header unconditionally for this.  */
1054   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1055   edge entry_e = single_pred_edge (preheader);
1056   bbs[0] = preheader;
1057   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1058 
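  /* Copy the loop body together with the new pre-header block.  When the
     copy is added at LOOP's exit, place the copied blocks after LOOP's
     latch; otherwise place them after E's source block.  */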
1059   exit = single_exit (scalar_loop);
1060   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1061 	    &exit, 1, &new_exit, NULL,
1062 	    at_exit ? loop->latch : e->src, true);
1063   exit = single_exit (loop);
1064   basic_block new_preheader = new_bbs[0];
1065 
1066   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1067 
1068   if (scalar_loop != loop)
1069     {
1070       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1071 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1072 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1073 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1074 	 header) to have current_def set, so copy them over.  */
1075       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1076 						exit);
1077       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1078 							   0),
1079 						EDGE_SUCC (loop->latch, 0));
1080     }
1081 
1082   if (at_exit) /* Add the loop copy at exit.  */
1083     {
1084       if (scalar_loop != loop)
1085 	{
1086 	  gphi_iterator gsi;
1087 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1088 
1089 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1090 	       gsi_next (&gsi))
1091 	    {
1092 	      gphi *phi = gsi.phi ();
1093 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1094 	      location_t orig_locus
1095 		= gimple_phi_arg_location_from_edge (phi, e);
1096 
1097 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1098 	    }
1099 	}
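      /* Route E into the copy's pre-header and materialize the PHI
	 argument updates queued by the redirection.  */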
1100       redirect_edge_and_branch_force (e, new_preheader);
1101       flush_pending_stmts (e);
1102       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1103       if (was_imm_dom || duplicate_outer_loop)
1104 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1105 
1106       /* And remove the now-unnecessary forwarder again.  Keep the other
1107          one so we have a proper pre-header for the loop at the exit edge.  */
1108       redirect_edge_pred (single_succ_edge (preheader),
1109 			  single_pred (preheader));
1110       delete_basic_block (preheader);
1111       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1112 			       loop_preheader_edge (scalar_loop)->src);
1113     }
1114   else /* Add the copy at entry.  */
1115     {
1116       if (scalar_loop != loop)
1117 	{
1118 	  /* Remove the now-unnecessary forwarder of SCALAR_LOOP again.  */
1119 	  redirect_edge_pred (single_succ_edge (preheader),
1120 			      single_pred (preheader));
1121 	  delete_basic_block (preheader);
1122 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1123 				   loop_preheader_edge (scalar_loop)->src);
1124 	  preheader = split_edge (loop_preheader_edge (loop));
1125 	  entry_e = single_pred_edge (preheader);
1126 	}
1127 
1128       redirect_edge_and_branch_force (entry_e, new_preheader);
1129       flush_pending_stmts (entry_e);
1130       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1131 
1132       redirect_edge_and_branch_force (new_exit, preheader);
1133       flush_pending_stmts (new_exit);
1134       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1135 
1136       /* And remove the now-unnecessary forwarder again.  Keep the other
1137          one so we have a proper pre-header for the loop at the exit edge.  */
1138       redirect_edge_pred (single_succ_edge (new_preheader),
1139 			  single_pred (new_preheader));
1140       delete_basic_block (new_preheader);
1141       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1142 			       loop_preheader_edge (new_loop)->src);
1143     }
1144 
1145   /* Skip the new preheader, which is deleted when the copy is added at entry.  */
1146   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1147     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1148 
1149   if (scalar_loop != loop)
1150     {
1151       /* Update new_loop->header PHIs, so that on the preheader
1152 	 edge they are the ones from loop rather than scalar_loop.  */
1153       gphi_iterator gsi_orig, gsi_new;
1154       edge orig_e = loop_preheader_edge (loop);
1155       edge new_e = loop_preheader_edge (new_loop);
1156 
1157       for (gsi_orig = gsi_start_phis (loop->header),
1158 	   gsi_new = gsi_start_phis (new_loop->header);
1159 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1160 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1161 	{
1162 	  gphi *orig_phi = gsi_orig.phi ();
1163 	  gphi *new_phi = gsi_new.phi ();
1164 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1165 	  location_t orig_locus
1166 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1167 
1168 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1169 	}
1170     }
1171 
1172   free (new_bbs);
1173   free (bbs);
1174 
1175   checking_verify_dominators (CDI_DOMINATORS);
1176 
1177   return new_loop;
1178 }
1179 
1180 
1181 /* Given the condition expression COND, put it as the last statement of
1182    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1183    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1184    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1185    new edge as irreducible if IRREDUCIBLE_P is true.  */
1186 
1187 static edge
1188 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1189 		       basic_block guard_to, basic_block dom_bb,
1190 		       profile_probability probability, bool irreducible_p)
1191 {
1192   gimple_stmt_iterator gsi;
1193   edge new_e, enter_e;
1194   gcond *cond_stmt;
1195   gimple_seq gimplify_stmt_list = NULL;
1196 
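  /* The existing successor edge of GUARD_BB becomes the false ("enter the
     loop") edge; the new edge created below is the true ("skip the loop")
     edge to GUARD_TO.  */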
1197   enter_e = EDGE_SUCC (guard_bb, 0);
1198   enter_e->flags &= ~EDGE_FALLTHRU;
1199   enter_e->flags |= EDGE_FALSE_VALUE;
1200   gsi = gsi_last_bb (guard_bb);
1201 
1202   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1203 				 NULL_TREE);
1204   if (gimplify_stmt_list)
1205     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1206 
1207   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1208   gsi = gsi_last_bb (guard_bb);
1209   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1210 
1211   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1212   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1213 
1214   new_e->probability = probability;
1215   if (irreducible_p)
1216     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1217 
1218   enter_e->probability = probability.invert ();
1219   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1220 
1221   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1222   if (enter_e->dest->loop_father->header == enter_e->dest)
1223     split_edge (enter_e);
1224 
1225   return new_e;
1226 }
1227 
1228 
1229 /* This function verifies that the following restrictions apply to LOOP:
1230    (1) it consists of exactly 2 basic blocks (the header and an empty latch)
1231        if it is an innermost loop, or of exactly 5 basic blocks if it is an
1232        outer loop;
1233    (2) it is single entry, single exit;
1234    (3) its exit condition is the last stmt in the header;
1235    (4) E is the entry or exit edge of LOOP.  */
1236 
1237 bool
1238 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1239 {
1240   edge exit_e = single_exit (loop);
1241   edge entry_e = loop_preheader_edge (loop);
1242   gcond *orig_cond = get_loop_exit_condition (loop);
1243   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1244   unsigned int num_bb = loop->inner? 5 : 2;
1245 
1246   /* All loops have an outer scope; the only loop for which loop_outer is
1247      NULL is the dummy outermost loop that represents the whole function.  */
1248   if (!loop_outer (loop)
1249       || loop->num_nodes != num_bb
1250       || !empty_block_p (loop->latch)
1251       || !single_exit (loop)
1252       /* Verify that new loop exit condition can be trivially modified.  */
1253       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1254       || (e != exit_e && e != entry_e))
1255     return false;
1256 
1257   return true;
1258 }
1259 
1260 /* If the loop has a virtual PHI, but the exit bb doesn't, create a virtual
1261    PHI in the exit bb and rename all the uses after the loop.  This
1262    simplifies the *guard[12] routines, which assume loop-closed SSA form for
1263    all PHIs (even though loop-closed SSA form normally doesn't require
1264    virtual PHIs to be in that form).  Doing this early simplifies checking
1265    which uses should be renamed.  */
1266 
1267 static void
1268 create_lcssa_for_virtual_phi (struct loop *loop)
1269 {
1270   gphi_iterator gsi;
1271   edge exit_e = single_exit (loop);
1272 
1273   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1274     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1275       {
1276 	gphi *phi = gsi.phi ();
1277 	for (gsi = gsi_start_phis (exit_e->dest);
1278 	     !gsi_end_p (gsi); gsi_next (&gsi))
1279 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1280 	    break;
1281 	if (gsi_end_p (gsi))
1282 	  {
1283 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1284 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1285 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1286 	    imm_use_iterator imm_iter;
1287 	    gimple *stmt;
1288 	    use_operand_p use_p;
1289 
1290 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1291 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1292 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1293 	    gimple_phi_set_result (new_phi, new_vop);
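	    /* Redirect every use of VOP outside the loop (other than the
	       new PHI itself) to the new loop-closed name.  */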
1294 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1295 	      if (stmt != new_phi
1296 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1297 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1298 		  SET_USE (use_p, new_vop);
1299 	  }
1300 	break;
1301       }
1302 
1303 }
1304 
1305 /* Function find_loop_location.
1306 
1307    Extract the location of the loop in the source code.
1308    If the loop is not well formed for vectorization, an estimated
1309    location is calculated.
1310    Return the loop location on success and an unknown location otherwise.  */
1311 
1312 dump_user_location_t
1313 find_loop_location (struct loop *loop)
1314 {
1315   gimple *stmt = NULL;
1316   basic_block bb;
1317   gimple_stmt_iterator si;
1318 
1319   if (!loop)
1320     return dump_user_location_t ();
1321 
1322   stmt = get_loop_exit_condition (loop);
1323 
1324   if (stmt
1325       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1326     return stmt;
1327 
1328   /* If we got here, the loop is probably not "well formed";
1329      try to estimate the loop location.  */
1330 
1331   if (!loop->header)
1332     return dump_user_location_t ();
1333 
1334   bb = loop->header;
1335 
1336   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1337     {
1338       stmt = gsi_stmt (si);
1339       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1340         return stmt;
1341     }
1342 
1343   return dump_user_location_t ();
1344 }
1345 
1346 /* Return true if the phi described by STMT_INFO defines an IV of the
1347    loop to be vectorized.  */
1348 
1349 static bool
1350 iv_phi_p (stmt_vec_info stmt_info)
1351 {
1352   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1353   if (virtual_operand_p (PHI_RESULT (phi)))
1354     return false;
1355 
1356   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1357       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1358     return false;
1359 
1360   return true;
1361 }
1362 
1363 /* Function vect_can_advance_ivs_p
1364 
1365    In case the number of iterations that LOOP iterates is unknown at compile
1366    time, an epilog loop will be generated, and the loop induction variables
1367    (IVs) will be "advanced" to the value they are supposed to take just before
1368    the epilog loop.  Here we check that the access function of the loop IVs
1369    and the expression that represents the loop bound are simple enough.
1370    These restrictions will be relaxed in the future.  */
1371 
1372 bool
1373 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1374 {
1375   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1376   basic_block bb = loop->header;
1377   gphi_iterator gsi;
1378 
1379   /* Analyze phi functions of the loop header.  */
1380 
1381   if (dump_enabled_p ())
1382     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1383   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1384     {
1385       tree evolution_part;
1386 
1387       gphi *phi = gsi.phi ();
1388       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1389       if (dump_enabled_p ())
1390 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1391 			 phi_info->stmt);
1392 
1393       /* Skip virtual PHIs.  The data dependences that are associated with
1394 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1395 
1396 	 Skip reduction phis.  */
1397       if (!iv_phi_p (phi_info))
1398 	{
1399 	  if (dump_enabled_p ())
1400 	    dump_printf_loc (MSG_NOTE, vect_location,
1401 			     "reduc or virtual phi. skip.\n");
1402 	  continue;
1403 	}
1404 
1405       /* Analyze the evolution function.  */
1406 
1407       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1408       if (evolution_part == NULL_TREE)
1409         {
1410 	  if (dump_enabled_p ())
1411 	    dump_printf (MSG_MISSED_OPTIMIZATION,
1412 			 "No access function or evolution.\n");
1413 	  return false;
1414         }
1415 
1416       /* FORNOW: We do not transform initial conditions of IVs
1417 	 whose evolution functions are not invariant in the loop.  */
1418 
1419       if (!expr_invariant_in_loop_p (loop, evolution_part))
1420 	{
1421 	  if (dump_enabled_p ())
1422 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 			     "evolution not invariant in loop.\n");
1424 	  return false;
1425 	}
1426 
1427       /* FORNOW: We do not transform initial conditions of IVs
1428 	 whose evolution functions are polynomials of degree >= 2.  */
1429 
1430       if (tree_is_chrec (evolution_part))
1431 	{
1432 	  if (dump_enabled_p ())
1433 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434 			     "evolution is chrec.\n");
1435 	  return false;
1436 	}
1437     }
1438 
1439   return true;
1440 }
1441 
1442 
1443 /*   Function vect_update_ivs_after_vectorizer.
1444 
1445      "Advance" the induction variables of LOOP to the value they should take
1446      after the execution of LOOP.  This is currently necessary because the
1447      vectorizer does not handle induction variables that are used after the
1448      loop.  Such a situation occurs when the last iterations of LOOP are
1449      peeled, because:
1450      1. We introduced new uses after LOOP for IVs that were not originally used
1451         after LOOP: the IVs of LOOP are now used by an epilog loop.
1452      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1453         times, whereas the loop IVs should be bumped N times.
1454 
1455      Input:
1456      - LOOP - a loop that is going to be vectorized. The last few iterations
1457               of LOOP were peeled.
1458      - NITERS - the number of iterations that LOOP executes (before it is
1459                 vectorized), i.e., the number of times the ivs should be bumped.
1460      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1461                   coming out from LOOP on which there are uses of the LOOP ivs
1462 		  (this is the path from LOOP->exit to epilog_loop->preheader).
1463 
1464                   The new definitions of the ivs are placed in LOOP->exit.
1465                   The phi args associated with the edge UPDATE_E in the bb
1466                   UPDATE_E->dest are updated accordingly.
1467 
1468      Assumption 1: Like the rest of the vectorizer, this function assumes
1469      a single loop exit that has a single predecessor.
1470 
1471      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1472      organized in the same order.
1473 
1474      Assumption 3: The access function of the ivs is simple enough (see
1475      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1476 
1477      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1478      coming out of LOOP on which the ivs of LOOP are used (this is the path
1479      that leads to the epilog loop; other paths skip the epilog loop).  This
1480      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1481      needs to have its phis updated.
1482  */
1483 
1484 static void
1485 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1486 				  tree niters, edge update_e)
1487 {
1488   gphi_iterator gsi, gsi1;
1489   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1490   basic_block update_bb = update_e->dest;
1491   basic_block exit_bb = single_exit (loop)->dest;
1492 
1493   /* Make sure there exists a single-predecessor exit bb:  */
1494   gcc_assert (single_pred_p (exit_bb));
1495   gcc_assert (single_succ_edge (exit_bb) == update_e);
1496 
1497   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1498        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1499        gsi_next (&gsi), gsi_next (&gsi1))
1500     {
1501       tree init_expr;
1502       tree step_expr, off;
1503       tree type;
1504       tree var, ni, ni_name;
1505       gimple_stmt_iterator last_gsi;
1506 
1507       gphi *phi = gsi.phi ();
1508       gphi *phi1 = gsi1.phi ();
1509       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1510       if (dump_enabled_p ())
1511 	dump_printf_loc (MSG_NOTE, vect_location,
1512 			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1513 
1514       /* Skip reduction and virtual phis.  */
1515       if (!iv_phi_p (phi_info))
1516 	{
1517 	  if (dump_enabled_p ())
1518 	    dump_printf_loc (MSG_NOTE, vect_location,
1519 			     "reduc or virtual phi. skip.\n");
1520 	  continue;
1521 	}
1522 
1523       type = TREE_TYPE (gimple_phi_result (phi));
1524       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1525       step_expr = unshare_expr (step_expr);
1526 
1527       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1528          of degree >= 2 or exponential.  */
1529       gcc_assert (!tree_is_chrec (step_expr));
1530 
1531       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1532 
1533       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1534 			 fold_convert (TREE_TYPE (step_expr), niters),
1535 			 step_expr);
1536       if (POINTER_TYPE_P (type))
1537 	ni = fold_build_pointer_plus (init_expr, off);
1538       else
1539 	ni = fold_build2 (PLUS_EXPR, type,
1540 			  init_expr, fold_convert (type, off));
1541 
1542       var = create_tmp_var (type, "tmp");
1543 
1544       last_gsi = gsi_last_bb (exit_bb);
1545       gimple_seq new_stmts = NULL;
1546       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1547       /* Exit_bb shouldn't be empty.  */
1548       if (!gsi_end_p (last_gsi))
1549 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1550       else
1551 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1552 
1553       /* Fix phi expressions in the successor bb.  */
1554       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1555     }
1556 }
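
/* Illustrative walk-through of the computation above (hypothetical values,
   not generated code): for an integer IV with initial value 0, step 4 and
   NITERS = 100 scalar iterations handled before the epilog, the expressions
   built in LOOP's exit block amount to

     off = 100 * 4               <- niters * step
     ni  = 0 + 400               <- init_expr + off

   and the PHI argument on UPDATE_E is rewritten to the SSA name holding ni,
   so the epilog loop resumes the IV at 400.  */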
1557 
1558 /* Return a gimple value containing the misalignment (measured in vector
1559    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1560    it is away from a perfectly aligned address.  Add any new statements
1561    to SEQ.  */
1562 
1563 static tree
1564 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1565 {
1566   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1567   stmt_vec_info stmt_info = dr_info->stmt;
1568   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1569 
1570   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1571   unsigned HOST_WIDE_INT target_align_c;
1572   tree target_align_minus_1;
1573 
1574   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1575 					size_zero_node) < 0;
1576   tree offset = (negative
1577 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1578 		 : size_zero_node);
1579   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1580 							  offset);
1581   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1582   if (target_align.is_constant (&target_align_c))
1583     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1584   else
1585     {
1586       tree vla = build_int_cst (type, target_align);
1587       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1588 				    fold_build2 (MINUS_EXPR, type,
1589 						 build_int_cst (type, 0), vla));
1590       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1591 					  build_int_cst (type, 1));
1592     }
1593 
1594   HOST_WIDE_INT elem_size
1595     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1596   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1597 
1598   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1599   tree int_start_addr = fold_convert (type, start_addr);
1600   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1601 					target_align_minus_1);
1602 
1603   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1604   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1605 					elem_size_log);
1606 
1607   return misalign_in_elems;
1608 }
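
/* A concrete example of the computation above (hypothetical values): with a
   constant target alignment of 16 bytes, 4-byte elements and a start
   address of 0x1008, the expressions evaluate to

     misalign_in_bytes = 0x1008 & (16 - 1)    = 8
     misalign_in_elems = 8 >> exact_log2 (4)  = 2

   i.e. the first access is two elements past a 16-byte boundary.  */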
1609 
1610 /* Function vect_gen_prolog_loop_niters
1611 
1612    Generate the number of iterations which should be peeled as prolog for the
1613    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1614    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1615    As a result, after the execution of this loop, the data reference DR will
1616    refer to an aligned location.  The following computation is generated:
1617 
1618    If the misalignment of DR is known at compile time:
1619      addr_mis = int mis = DR_MISALIGNMENT (dr);
1620    Else, compute address misalignment in bytes:
1621      addr_mis = addr & (target_align - 1)
1622 
1623    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1624 
1625    (elem_size = element type size; an element is the scalar element whose type
1626    is the inner type of the vectype)
1627 
1628    The computations will be emitted at the end of BB.  We also compute and
1629    store the upper bound (inclusive) of the result in BOUND.
1630 
1631    When the step of the data-ref in the loop is not 1 (as in interleaved data
1632    and SLP), the number of iterations of the prolog must be divided by the step
1633    (which is equal to the size of interleaved group).
1634 
1635    The above formulas assume that VF == number of elements in the vector.  This
1636    may not hold when there are multiple types in the loop.
1637    In this case, for some data-references in the loop the VF does not represent
1638    the number of elements that fit in the vector.  Therefore, instead of VF we
1639    use TYPE_VECTOR_SUBPARTS.  */
1640 
1641 static tree
1642 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1643 			     basic_block bb, int *bound)
1644 {
1645   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1646   tree var;
1647   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1648   gimple_seq stmts = NULL, new_stmts = NULL;
1649   tree iters, iters_name;
1650   stmt_vec_info stmt_info = dr_info->stmt;
1651   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1652   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1653 
1654   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1655     {
1656       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1657 
1658       if (dump_enabled_p ())
1659         dump_printf_loc (MSG_NOTE, vect_location,
1660                          "known peeling = %d.\n", npeel);
1661 
1662       iters = build_int_cst (niters_type, npeel);
1663       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1664     }
1665   else
1666     {
1667       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1668       tree type = TREE_TYPE (misalign_in_elems);
1669       HOST_WIDE_INT elem_size
1670 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1671       /* We only do prolog peeling if the target alignment is known at compile
1672          time.  */
1673       poly_uint64 align_in_elems =
1674 	exact_div (target_align, elem_size);
1675       tree align_in_elems_minus_1 =
1676 	build_int_cst (type, align_in_elems - 1);
1677       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1678 
1679       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1680 				 & (align_in_elems - 1)).  */
1681       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1682 					    size_zero_node) < 0;
1683       if (negative)
1684 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1685 			     align_in_elems_tree);
1686       else
1687 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1688 			     misalign_in_elems);
1689       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1690       iters = fold_convert (niters_type, iters);
1691       unsigned HOST_WIDE_INT align_in_elems_c;
1692       if (align_in_elems.is_constant (&align_in_elems_c))
1693 	*bound = align_in_elems_c - 1;
1694       else
1695 	*bound = -1;
1696     }
1697 
1698   if (dump_enabled_p ())
1699     dump_printf_loc (MSG_NOTE, vect_location,
1700 		     "niters for prolog loop: %T\n", iters);
1701 
1702   var = create_tmp_var (niters_type, "prolog_loop_niters");
1703   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1704 
1705   if (new_stmts)
1706     gimple_seq_add_seq (&stmts, new_stmts);
1707   if (stmts)
1708     {
1709       gcc_assert (single_succ_p (bb));
1710       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1711       if (gsi_end_p (gsi))
1712 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1713       else
1714 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1715     }
1716   return iters_name;
1717 }
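
/* Worked example for the variable-misalignment branch above (hypothetical
   values): with align_in_elems = 4 and misalign_in_elems = 3 the emitted
   expression evaluates to

     iters = (4 - 3) & (4 - 1) = 1

   so one scalar iteration is peeled into the prolog; with
   misalign_in_elems = 0 the BIT_AND_EXPR folds the result to 0 instead of 4,
   so an already-aligned access peels nothing.  */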
1718 
1719 
1720 /* Function vect_update_init_of_dr
1721 
1722    If CODE is PLUS, the vector loop starts NITERS iterations after the
1723    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1724    iterations before the scalar one (using masking to skip inactive
1725    elements).  This function updates the information recorded in DR to
1726    account for the difference.  Specifically, it updates the OFFSET
1727    field of DR.  */
1728 
1729 static void
1730 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1731 {
1732   tree offset = DR_OFFSET (dr);
1733 
1734   niters = fold_build2 (MULT_EXPR, sizetype,
1735 			fold_convert (sizetype, niters),
1736 			fold_convert (sizetype, DR_STEP (dr)));
1737   offset = fold_build2 (code, sizetype,
1738 			fold_convert (sizetype, offset), niters);
1739   DR_OFFSET (dr) = offset;
1740 }
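
/* For example (hypothetical values): with DR_STEP = 8 bytes and NITERS = 3
   peeled prolog iterations, the PLUS_EXPR form above adds 3 * 8 = 24 bytes
   to DR_OFFSET, so the vector loop's first access starts where the prolog
   stopped; the MINUS_EXPR form subtracts the same amount when the main loop
   instead starts NITERS iterations early and masks off the inactive leading
   elements.  */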
1741 
1742 
1743 /* Function vect_update_inits_of_drs
1744 
1745    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1746    CODE and NITERS are as for vect_update_inits_of_dr.  */
1747 
1748 static void
1749 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1750 			  tree_code code)
1751 {
1752   unsigned int i;
1753   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1754   struct data_reference *dr;
1755 
1756   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1757 
1758   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1759   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1760     {
1761       gimple_seq seq;
1762       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1763       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1764 
1765       niters = fold_convert (sizetype, niters);
1766       niters = force_gimple_operand (niters, &seq, false, var);
1767       if (seq)
1768 	{
1769 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1770 	  gcc_assert (!new_bb);
1771 	}
1772     }
1773 
1774   FOR_EACH_VEC_ELT (datarefs, i, dr)
1775     {
1776       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1777       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1778 	vect_update_init_of_dr (dr, niters, code);
1779     }
1780 }
1781 
1782 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1783    by masking.  This involves calculating the number of iterations to
1784    be peeled and then aligning all memory references appropriately.  */
1785 
1786 void
1787 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1788 {
1789   tree misalign_in_elems;
1790   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1791 
1792   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1793 
1794   /* From the information recorded in LOOP_VINFO get the number of iterations
1795      that need to be skipped via masking.  */
1796   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1797     {
1798       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1799 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1800       misalign_in_elems = build_int_cst (type, misalign);
1801     }
1802   else
1803     {
1804       gimple_seq seq1 = NULL, seq2 = NULL;
1805       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1806       misalign_in_elems = fold_convert (type, misalign_in_elems);
1807       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1808 						&seq2, true, NULL_TREE);
1809       gimple_seq_add_seq (&seq1, seq2);
1810       if (seq1)
1811 	{
1812 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1813 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1814 	  gcc_assert (!new_bb);
1815 	}
1816     }
1817 
1818   if (dump_enabled_p ())
1819     dump_printf_loc (MSG_NOTE, vect_location,
1820 		     "misalignment for fully-masked loop: %T\n",
1821 		     misalign_in_elems);
1822 
1823   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1824 
1825   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1826 }
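
/* Hypothetical example: with VF = 8 and a known peeling amount of 3
   iterations, the code above records 8 - 3 = 5 in
   LOOP_VINFO_MASK_SKIP_NITERS, i.e. the first five elements of the first
   vector iteration are masked off, and every DR_OFFSET is moved back by
   5 * DR_STEP so that the first active element lines up with the first
   scalar iteration.  */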
1827 
1828 /* This function builds ni_name = number of iterations.  Statements
1829    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1830    it to TRUE if a new SSA name is generated.  */
1831 
1832 tree
1833 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1834 {
1835   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1836   if (TREE_CODE (ni) == INTEGER_CST)
1837     return ni;
1838   else
1839     {
1840       tree ni_name, var;
1841       gimple_seq stmts = NULL;
1842       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1843 
1844       var = create_tmp_var (TREE_TYPE (ni), "niters");
1845       ni_name = force_gimple_operand (ni, &stmts, false, var);
1846       if (stmts)
1847 	{
1848 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1849 	  if (new_var_p != NULL)
1850 	    *new_var_p = true;
1851 	}
1852 
1853       return ni_name;
1854     }
1855 }
1856 
1857 /* Calculate the number of iterations above which the vectorized loop will
1858    be preferred over the scalar loop.  NITERS_PROLOG is the number of
1859    iterations of the prolog loop.  If it is an integer constant, its value
1860    is also passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound
1861    (inclusive) of the number of iterations of the prolog loop.  BOUND_EPILOG
1862    is the corresponding value for the epilog loop.  If CHECK_PROFITABILITY is
1863    true, TH is the threshold below which the scalar (rather than vectorized)
1864    loop will be executed.  This function stores the upper bound (inclusive)
1865    of the result in BOUND_SCALAR.  */
1866 
1867 static tree
1868 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1869 			     int bound_prolog, poly_int64 bound_epilog, int th,
1870 			     poly_uint64 *bound_scalar,
1871 			     bool check_profitability)
1872 {
1873   tree type = TREE_TYPE (niters_prolog);
1874   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1875 			     build_int_cst (type, bound_epilog));
1876 
1877   *bound_scalar = bound_prolog + bound_epilog;
1878   if (check_profitability)
1879     {
1880       /* TH indicates the minimum niters of vectorized loop, while we
1881 	 compute the maximum niters of scalar loop.  */
1882       th--;
1883       /* Peeling a constant number of times.  */
1884       if (int_niters_prolog >= 0)
1885 	{
1886 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1887 	  return build_int_cst (type, *bound_scalar);
1888 	}
1889       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1890 	 and BOUND_EPILOG are inclusive upper bounds.  */
1891       if (known_ge (th, bound_prolog + bound_epilog))
1892 	{
1893 	  *bound_scalar = th;
1894 	  return build_int_cst (type, th);
1895 	}
1896       /* Need to do runtime comparison.  */
1897       else if (maybe_gt (th, bound_epilog))
1898 	{
1899 	  *bound_scalar = upper_bound (*bound_scalar, th);
1900 	  return fold_build2 (MAX_EXPR, type,
1901 			      build_int_cst (type, th), niters);
1902 	}
1903     }
1904   return niters;
1905 }
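
/* Hypothetical example with CHECK_PROFITABILITY: for a constant prolog
   peeling of 2 iterations, BOUND_EPILOG = 4 and TH = 10, the code above
   first decrements TH to 9 (TH is the minimum profitable niters of the
   vectorized loop, whereas we want the maximum niters of the scalar path)
   and then returns max (2 + 4, 9) = 9, which is also stored in
   BOUND_SCALAR.  */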
1906 
1907 /* NITERS is the number of times that the original scalar loop executes
1908    after peeling.  Work out the maximum number of iterations N that can
1909    be handled by the vectorized form of the loop and then either:
1910 
1911    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1912 
1913 	niters_vector = N
1914 
1915    b) set *STEP_VECTOR_PTR to one and generate:
1916 
1917         niters_vector = N / vf
1918 
1919    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1920    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1921    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1922 
1923 void
1924 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1925 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1926 			     bool niters_no_overflow)
1927 {
1928   tree ni_minus_gap, var;
1929   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1930   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1931   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1932   tree log_vf = NULL_TREE;
1933 
1934   /* If an epilogue loop is required because of data accesses with gaps, we
1935      subtract one iteration from the total number of iterations here for a
1936      correct calculation of the vector iteration count.  */
1937   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1938     {
1939       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1940 				  build_one_cst (type));
1941       if (!is_gimple_val (ni_minus_gap))
1942 	{
1943 	  var = create_tmp_var (type, "ni_gap");
1944 	  gimple *stmts = NULL;
1945 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1946 					       true, var);
1947 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1948 	}
1949     }
1950   else
1951     ni_minus_gap = niters;
1952 
1953   unsigned HOST_WIDE_INT const_vf;
1954   if (vf.is_constant (&const_vf)
1955       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1956     {
1957       /* Create: niters >> log2(vf) */
1958       /* If it's known that niters == number of latch executions + 1 doesn't
1959 	 overflow, we can generate niters >> log2(vf); otherwise we generate
1960 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1961 	 will be at least one.  */
1962       log_vf = build_int_cst (type, exact_log2 (const_vf));
1963       if (niters_no_overflow)
1964 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1965       else
1966 	niters_vector
1967 	  = fold_build2 (PLUS_EXPR, type,
1968 			 fold_build2 (RSHIFT_EXPR, type,
1969 				      fold_build2 (MINUS_EXPR, type,
1970 						   ni_minus_gap,
1971 						   build_int_cst (type, vf)),
1972 				      log_vf),
1973 			 build_int_cst (type, 1));
1974       step_vector = build_one_cst (type);
1975     }
1976   else
1977     {
1978       niters_vector = ni_minus_gap;
1979       step_vector = build_int_cst (type, vf);
1980     }
1981 
1982   if (!is_gimple_val (niters_vector))
1983     {
1984       var = create_tmp_var (type, "bnd");
1985       gimple_seq stmts = NULL;
1986       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1987       gsi_insert_seq_on_edge_immediate (pe, stmts);
1988       /* The peeling algorithm guarantees a vector loop bound of at least ONE,
1989 	 so set range information to help the niters analyzer.  */
1990       if (stmts != NULL && log_vf)
1991 	set_range_info (niters_vector, VR_RANGE,
1992 			wi::to_wide (build_int_cst (type, 1)),
1993 			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1994 						  TYPE_MAX_VALUE (type),
1995 						  log_vf)));
1996     }
1997   *niters_vector_ptr = niters_vector;
1998   *step_vector_ptr = step_vector;
1999 
2000   return;
2001 }
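
/* Worked example (hypothetical numbers, assuming no peeling for gaps so
   ni_minus_gap == NITERS): for NITERS = 17, a constant VF = 4 and no full
   masking, the no-overflow form computes

     niters_vector = 17 >> 2 = 4

   i.e. four vector iterations covering 16 scalar ones; the overflow-safe
   form ((17 - 4) >> 2) + 1 gives the same value here and remains correct
   even when NITERS itself has wrapped around to a small value.  */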
2002 
2003 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2004    loop specified by LOOP_VINFO after vectorization, compute the number
2005    of iterations before vectorization (niters_vector * vf) and store it
2006    to NITERS_VECTOR_MULT_VF_PTR.  */
2007 
2008 static void
2009 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2010 				     tree niters_vector,
2011 				     tree *niters_vector_mult_vf_ptr)
2012 {
2013   /* We should be using a step_vector of VF if VF is variable.  */
2014   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2015   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2016   tree type = TREE_TYPE (niters_vector);
2017   tree log_vf = build_int_cst (type, exact_log2 (vf));
2018   basic_block exit_bb = single_exit (loop)->dest;
2019 
2020   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2021   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2022 					    niters_vector, log_vf);
2023   if (!is_gimple_val (niters_vector_mult_vf))
2024     {
2025       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2026       gimple_seq stmts = NULL;
2027       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2028 						    &stmts, true, var);
2029       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2030       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2031     }
2032   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2033 }
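
/* Continuing the hypothetical numbers above: with niters_vector = 4 and a
   constant VF = 4, the LSHIFT_EXPR yields niters_vector_mult_vf
   = 4 << 2 = 16, the number of scalar iterations actually executed by the
   vector loop; vect_update_ivs_after_vectorizer then advances the IVs seen
   by the epilog loop by this amount.  */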
2034 
2035 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2036    from SECOND/FIRST and puts the copy at the original loop's preheader/exit
2037    edge; the two loops are then arranged as below:
2038 
2039        preheader_a:
2040      first_loop:
2041        header_a:
2042 	 i_1 = PHI<i_0, i_2>;
2043 	 ...
2044 	 i_2 = i_1 + 1;
2045 	 if (cond_a)
2046 	   goto latch_a;
2047 	 else
2048 	   goto between_bb;
2049        latch_a:
2050 	 goto header_a;
2051 
2052        between_bb:
2053 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2054 
2055      second_loop:
2056        header_b:
2057 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2058 				 or with i_2 if no LCSSA phi is created
2059 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2060 	 ...
2061 	 i_4 = i_3 + 1;
2062 	 if (cond_b)
2063 	   goto latch_b;
2064 	 else
2065 	   goto exit_bb;
2066        latch_b:
2067 	 goto header_b;
2068 
2069        exit_bb:
2070 
2071    This function creates loop-closed SSA for the first loop and updates the
2072    second loop's PHI nodes by replacing the argument on the incoming edge
2073    with the result of the newly created LCSSA PHI nodes.  If
2074    CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed SSA PHIs will only be
2075    created for the non-IV PHIs of the first loop.
2076 
2077    This function assumes the exit bb of the first loop is the preheader bb
2078    of the second loop, i.e., between_bb in the example code.  With the PHIs
2079    updated, the second loop executes the remaining iterations of the first.  */
2080 
2081 static void
2082 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2083 				   struct loop *first, struct loop *second,
2084 				   bool create_lcssa_for_iv_phis)
2085 {
2086   gphi_iterator gsi_update, gsi_orig;
2087   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2088 
2089   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2090   edge second_preheader_e = loop_preheader_edge (second);
2091   basic_block between_bb = single_exit (first)->dest;
2092 
2093   gcc_assert (between_bb == second_preheader_e->src);
2094   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2095   /* Either the first loop or the second is the loop to be vectorized.  */
2096   gcc_assert (loop == first || loop == second);
2097 
2098   for (gsi_orig = gsi_start_phis (first->header),
2099        gsi_update = gsi_start_phis (second->header);
2100        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2101        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2102     {
2103       gphi *orig_phi = gsi_orig.phi ();
2104       gphi *update_phi = gsi_update.phi ();
2105 
2106       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2107       /* Generate lcssa PHI node for the first loop.  */
2108       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2109       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2110       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2111 	{
2112 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2113 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2114 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2115 	  arg = new_res;
2116 	}
2117 
2118       /* Update PHI node in the second loop by replacing arg on the loop's
2119 	 incoming edge.  */
2120       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2121     }
2122 }
2123 
2124 /* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2125    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2126    are the two pred edges of the merge point before UPDATE_LOOP.  The two
2127    loops appear as below:
2128 
2129        guard_bb:
2130 	 if (cond)
2131 	   goto merge_bb;
2132 	 else
2133 	   goto skip_loop;
2134 
2135      skip_loop:
2136        header_a:
2137 	 i_1 = PHI<i_0, i_2>;
2138 	 ...
2139 	 i_2 = i_1 + 1;
2140 	 if (cond_a)
2141 	   goto latch_a;
2142 	 else
2143 	   goto exit_a;
2144        latch_a:
2145 	 goto header_a;
2146 
2147        exit_a:
2148 	 i_5 = PHI<i_2>;
2149 
2150        merge_bb:
2151 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2152 
2153      update_loop:
2154        header_b:
2155 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2156 	 ...
2157 	 i_4 = i_3 + 1;
2158 	 if (cond_b)
2159 	   goto latch_b;
2160 	 else
2161 	   goto exit_bb;
2162        latch_b:
2163 	 goto header_b;
2164 
2165        exit_bb:
2166 
2167    This function creates PHI nodes at merge_bb and replaces the use of i_5
2168    in the update_loop's PHI node with the new PHI's result.  */
2169 
2170 static void
2171 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2172 				    struct loop *update_loop,
2173 				    edge guard_edge, edge merge_edge)
2174 {
2175   location_t merge_loc, guard_loc;
2176   edge orig_e = loop_preheader_edge (skip_loop);
2177   edge update_e = loop_preheader_edge (update_loop);
2178   gphi_iterator gsi_orig, gsi_update;
2179 
2180   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2181 	gsi_update = gsi_start_phis (update_loop->header));
2182        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2183        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2184     {
2185       gphi *orig_phi = gsi_orig.phi ();
2186       gphi *update_phi = gsi_update.phi ();
2187 
2188       /* Generate new phi node at merge bb of the guard.  */
2189       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2190       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2191 
2192       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2193 	 args in NEW_PHI for these edges.  */
2194       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2195       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2196       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2197       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2198       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2199       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2200 
2201       /* Update phi in UPDATE_PHI.  */
2202       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2203     }
2204 }
2205 
2206 /* LCSSA_PHI is an LCSSA PHI of the EPILOG loop, which is copied from LOOP.
2207    This function searches for the corresponding LCSSA PHI node in the exit
2208    bb of LOOP.  If it is found, return the PHI result; otherwise return
2209    NULL_TREE.  */
2210 
2211 static tree
2212 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2213 		gphi *lcssa_phi)
2214 {
2215   gphi_iterator gsi;
2216   edge e = single_exit (loop);
2217 
2218   gcc_assert (single_pred_p (e->dest));
2219   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2220     {
2221       gphi *phi = gsi.phi ();
2222       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2223 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2224 	return PHI_RESULT (phi);
2225     }
2226   return NULL_TREE;
2227 }
2228 
2229 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2230    from LOOP.  Function slpeel_add_loop_guard adds a guard that skips from a
2231    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2232    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2233    The CFG looks like:
2234 
2235      loop:
2236        header_a:
2237 	 i_1 = PHI<i_0, i_2>;
2238 	 ...
2239 	 i_2 = i_1 + 1;
2240 	 if (cond_a)
2241 	   goto latch_a;
2242 	 else
2243 	   goto exit_a;
2244        latch_a:
2245 	 goto header_a;
2246 
2247        exit_a:
2248 
2249        guard_bb:
2250 	 if (cond)
2251 	   goto merge_bb;
2252 	 else
2253 	   goto epilog_loop;
2254 
2255        ;; fall_through_bb
2256 
2257      epilog_loop:
2258        header_b:
2259 	 i_3 = PHI<i_2, i_4>;
2260 	 ...
2261 	 i_4 = i_3 + 1;
2262 	 if (cond_b)
2263 	   goto latch_b;
2264 	 else
2265 	   goto merge_bb;
2266        latch_b:
2267 	 goto header_b;
2268 
2269        merge_bb:
2270 	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2271 
2272        exit_bb:
2273 	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2274 
2275    For each name used outside EPILOG (i.e., for each name that has an LCSSA
2276    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2277    args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
2278    is the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is
2279    defined by LOOP and is found in the exit bb of LOOP.  The arg of the
2280    original PHI in exit_bb will also be updated.  */
2281 
2282 static void
2283 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2284 				    edge guard_edge, edge merge_edge)
2285 {
2286   gphi_iterator gsi;
2287   basic_block merge_bb = guard_edge->dest;
2288 
2289   gcc_assert (single_succ_p (merge_bb));
2290   edge e = single_succ_edge (merge_bb);
2291   basic_block exit_bb = e->dest;
2292   gcc_assert (single_pred_p (exit_bb));
2293   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2294 
2295   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2296     {
2297       gphi *update_phi = gsi.phi ();
2298       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2299       /* This loop-closed-phi actually doesn't represent a use out of the
2300 	 loop - the phi arg is a constant.  */
2301       if (TREE_CODE (old_arg) != SSA_NAME)
2302 	continue;
2303 
2304       tree merge_arg = get_current_def (old_arg);
2305       if (!merge_arg)
2306 	merge_arg = old_arg;
2307 
2308       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2309       /* If the var is live after loop but not a reduction, we simply
2310 	 use the old arg.  */
2311       if (!guard_arg)
2312 	guard_arg = old_arg;
2313 
2314       /* Create new phi node in MERGE_BB:  */
2315       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2316       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2317 
2318       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2319 	 the two PHI args in merge_phi for these edges.  */
2320       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2321       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2322 
2323       /* Update the original phi in exit_bb.  */
2324       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2325     }
2326 }
2327 
2328 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2329    the args of its loop-closed SSA PHIs need to be updated.  */
2330 
2331 static void
2332 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2333 {
2334   gphi_iterator gsi;
2335   basic_block exit_bb = single_exit (epilog)->dest;
2336 
2337   gcc_assert (single_pred_p (exit_bb));
2338   edge e = EDGE_PRED (exit_bb, 0);
2339   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2340     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2341 }
2342 
2343 /* Function vect_do_peeling.
2344 
2345    Input:
2346    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2347 
2348        preheader:
2349      LOOP:
2350        header_bb:
2351 	 loop_body
2352 	 if (exit_loop_cond) goto exit_bb
2353 	 else                goto header_bb
2354        exit_bb:
2355 
2356    - NITERS: The number of iterations of the loop.
2357    - NITERSM1: The number of iterations of the loop's latch.
2358    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2359    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2360 			      CHECK_PROFITABILITY is true.
2361    Output:
2362    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2363      iterate after vectorization; see vect_set_loop_condition for details.
2364    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2365      should be set to the number of scalar iterations handled by the
2366      vector loop.  The SSA name is only used on exit from the loop.
2367 
2368    This function peels prolog and epilog from the loop, adds guards skipping
2369    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2370    would look like:
2371 
2372        guard_bb_1:
2373 	 if (prefer_scalar_loop) goto merge_bb_1
2374 	 else                    goto guard_bb_2
2375 
2376        guard_bb_2:
2377          if (skip_prolog) goto merge_bb_2
2378          else             goto prolog_preheader
2379 
2380        prolog_preheader:
2381      PROLOG:
2382        prolog_header_bb:
2383 	 prolog_body
2384 	 if (exit_prolog_cond) goto prolog_exit_bb
2385 	 else                  goto prolog_header_bb
2386        prolog_exit_bb:
2387 
2388        merge_bb_2:
2389 
2390        vector_preheader:
2391      VECTOR LOOP:
2392        vector_header_bb:
2393 	 vector_body
2394 	 if (exit_vector_cond) goto vector_exit_bb
2395 	 else                  goto vector_header_bb
2396        vector_exit_bb:
2397 
2398        guard_bb_3:
2399 	 if (skip_epilog) goto merge_bb_3
2400 	 else             goto epilog_preheader
2401 
2402        merge_bb_1:
2403 
2404        epilog_preheader:
2405      EPILOG:
2406        epilog_header_bb:
2407 	 epilog_body
2408 	 if (exit_epilog_cond) goto merge_bb_3
2409 	 else                  goto epilog_header_bb
2410 
2411        merge_bb_3:
2412 
2413    Note this function peels the prolog and epilog, and adds the guards,
2414    only when necessary.
2415    Returns the created epilogue loop or NULL.
2416 
2417    TODO: Guard for prefer_scalar_loop should be emitted along with
2418    versioning conditions if loop versioning is needed.  */
2419 
2420 
2421 struct loop *
2422 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2423 		 tree *niters_vector, tree *step_vector,
2424 		 tree *niters_vector_mult_vf_var, int th,
2425 		 bool check_profitability, bool niters_no_overflow)
2426 {
2427   edge e, guard_e;
2428   tree type = TREE_TYPE (niters), guard_cond;
2429   basic_block guard_bb, guard_to;
2430   profile_probability prob_prolog, prob_vector, prob_epilog;
2431   int estimated_vf;
2432   int prolog_peeling = 0;
2433   /* We currently do not support prolog peeling if the target alignment is not
2434      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2435      target alignment being constant.  */
2436   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2437   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2438     return NULL;
2439 
2440   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2441     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2442 
2443   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2444   poly_uint64 bound_epilog = 0;
2445   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2446       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2447     bound_epilog += vf - 1;
2448   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2449     bound_epilog += 1;
2450   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2451   poly_uint64 bound_scalar = bound_epilog;
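
  /* For instance (hypothetical values): with VF = 4, peeling for niter and
     PEELING_FOR_GAPS, the code above gives bound_epilog = (4 - 1) + 1 = 4,
     so the epilog executes at most four scalar iterations.  */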
2452 
2453   if (!prolog_peeling && !epilog_peeling)
2454     return NULL;
2455 
2456   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2457   estimated_vf = vect_vf_for_cost (loop_vinfo);
2458   if (estimated_vf == 2)
2459     estimated_vf = 3;
2460   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2461 			.apply_scale (estimated_vf - 1, estimated_vf);
2462 
2463   struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2464   struct loop *first_loop = loop;
2465   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2466   create_lcssa_for_virtual_phi (loop);
2467   update_ssa (TODO_update_ssa_only_virtuals);
2468 
2469   if (MAY_HAVE_DEBUG_BIND_STMTS)
2470     {
2471       gcc_assert (!adjust_vec.exists ());
2472       adjust_vec.create (32);
2473     }
2474   initialize_original_copy_tables ();
2475 
2476   /* Record the anchor bb at which the guard should be placed if the scalar
2477      loop might be preferred.  */
2478   basic_block anchor = loop_preheader_edge (loop)->src;
2479 
2480   /* Generate the number of iterations for the prolog loop.  We do this here
2481      so that we can also get the upper bound on the number of iterations.  */
2482   tree niters_prolog;
2483   int bound_prolog = 0;
2484   if (prolog_peeling)
2485     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2486 						 &bound_prolog);
2487   else
2488     niters_prolog = build_int_cst (type, 0);
2489 
2490   /* Prolog loop may be skipped.  */
2491   bool skip_prolog = (prolog_peeling != 0);
2492   /* Skip to epilog if scalar loop may be preferred.  It's only needed
2493      when we peel for epilog loop and when it hasn't been checked with
2494      loop versioning.  */
2495   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2496 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2497 				  bound_prolog + bound_epilog)
2498 		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2499   /* Epilog loop must be executed if the number of iterations for epilog
2500      loop is known at compile time, otherwise we need to add a check at
2501      the end of vector loop and skip to the end of epilog loop.  */
2502   bool skip_epilog = (prolog_peeling < 0
2503 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2504 		      || !vf.is_constant ());
2505   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2506   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2507     skip_epilog = false;
2508 
2509   if (skip_vector)
2510     {
2511       split_edge (loop_preheader_edge (loop));
2512 
2513       /* Due to the order in which we peel prolog and epilog, we first
2514 	 propagate probability to the whole loop.  The purpose is to
2515 	 avoid adjusting probabilities of both prolog and vector loops
2516 	 separately.  Note in this case, the probability of epilog loop
2517 	 needs to be scaled back later.  */
2518       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2519       if (prob_vector.initialized_p ())
2520 	{
2521 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2522 	  scale_loop_profile (loop, prob_vector, 0);
2523 	}
2524     }
2525 
2526   dump_user_location_t loop_loc = find_loop_location (loop);
2527   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2528   if (prolog_peeling)
2529     {
2530       e = loop_preheader_edge (loop);
2531       if (!slpeel_can_duplicate_loop_p (loop, e))
2532 	{
2533 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2534 			   "loop can't be duplicated to preheader edge.\n");
2535 	  gcc_unreachable ();
2536 	}
2537       /* Peel prolog and put it on preheader edge of loop.  */
2538       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2539       if (!prolog)
2540 	{
2541 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2542 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2543 	  gcc_unreachable ();
2544 	}
2545       prolog->force_vectorize = false;
2546       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2547       first_loop = prolog;
2548       reset_original_copy_tables ();
2549 
2550       /* Update the number of iterations for prolog loop.  */
2551       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2552       vect_set_loop_condition (prolog, NULL, niters_prolog,
2553 			       step_prolog, NULL_TREE, false);
2554 
2555       /* Skip the prolog loop.  */
2556       if (skip_prolog)
2557 	{
2558 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2559 				    niters_prolog, build_int_cst (type, 0));
2560 	  guard_bb = loop_preheader_edge (prolog)->src;
2561 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2562 	  guard_to = split_edge (loop_preheader_edge (loop));
2563 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2564 					   guard_to, guard_bb,
2565 					   prob_prolog.invert (),
2566 					   irred_flag);
2567 	  e = EDGE_PRED (guard_to, 0);
2568 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2569 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2570 
2571 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2572 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2573 	}
2574       /* Update init address of DRs.  */
2575       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2576       /* Update niters for vector loop.  */
2577       LOOP_VINFO_NITERS (loop_vinfo)
2578 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2579       LOOP_VINFO_NITERSM1 (loop_vinfo)
2580 	= fold_build2 (MINUS_EXPR, type,
2581 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2582       bool new_var_p = false;
2583       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2584       /* It's guaranteed that vector loop bound before vectorization is at
2585 	 least VF, so set range information for newly generated var.  */
2586       if (new_var_p)
2587 	set_range_info (niters, VR_RANGE,
2588 			wi::to_wide (build_int_cst (type, vf)),
2589 			wi::to_wide (TYPE_MAX_VALUE (type)));
2590 
2591       /* Prolog iterates at most bound_prolog times, latch iterates at
2592 	 most bound_prolog - 1 times.  */
2593       record_niter_bound (prolog, bound_prolog - 1, false, true);
2594       delete_update_ssa ();
2595       adjust_vec_debug_stmts ();
2596       scev_reset ();
2597     }
2598 
2599   if (epilog_peeling)
2600     {
2601       e = single_exit (loop);
2602       if (!slpeel_can_duplicate_loop_p (loop, e))
2603 	{
2604 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2605 			   "loop can't be duplicated to exit edge.\n");
2606 	  gcc_unreachable ();
2607 	}
2608       /* Peel epilog and put it on exit edge of loop.  */
2609       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2610       if (!epilog)
2611 	{
2612 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2613 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2614 	  gcc_unreachable ();
2615 	}
2616       epilog->force_vectorize = false;
2617       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2618 
2619       /* Scalar version loop may be preferred.  In this case, add guard
2620 	 and skip to epilog.  Note this only happens when the number of
2621 	 iterations of loop is unknown at compile time, otherwise this
2622 	 won't be vectorized.  */
2623       if (skip_vector)
2624 	{
2625 	  /* Additional epilogue iteration is peeled if gap exists.  */
2626 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2627 						bound_prolog, bound_epilog,
2628 						th, &bound_scalar,
2629 						check_profitability);
2630 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2631 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2632 	  guard_bb = anchor;
2633 	  guard_to = split_edge (loop_preheader_edge (epilog));
2634 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2635 					   guard_to, guard_bb,
2636 					   prob_vector.invert (),
2637 					   irred_flag);
2638 	  e = EDGE_PRED (guard_to, 0);
2639 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2640 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2641 
2642 	  /* Simply propagate profile info from guard_bb to guard_to which is
2643 	     a merge point of control flow.  */
2644 	  guard_to->count = guard_bb->count;
2645 
2646 	  /* Scale probability of epilog loop back.
2647 	     FIXME: We should avoid scaling down and back up.  Profile may
2648 	     get lost if we scale down to 0.  */
2649 	  basic_block *bbs = get_loop_body (epilog);
2650 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2651 	    bbs[i]->count = bbs[i]->count.apply_scale
2652 				 (bbs[i]->count,
2653 				  bbs[i]->count.apply_probability
2654 				    (prob_vector));
2655 	  free (bbs);
2656 	}
2657 
2658       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2659       tree niters_vector_mult_vf;
2660       /* If the loop is peeled a non-zero constant number of times, NITERS now
2661 	 refers to orig_niters - prolog_peeling, which cannot overflow even if
2662 	 orig_niters overflows.  */
2663       niters_no_overflow |= (prolog_peeling > 0);
2664       vect_gen_vector_loop_niters (loop_vinfo, niters,
2665 				   niters_vector, step_vector,
2666 				   niters_no_overflow);
2667       if (!integer_onep (*step_vector))
2668 	{
2669 	  /* On exit from the loop we will have an easy way of calculating
2670 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2671 	     until then.  */
2672 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2673 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2674 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2675 	}
2676       else
2677 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2678 					     &niters_vector_mult_vf);
2679       /* Update IVs of original loop as if they were advanced by
2680 	 niters_vector_mult_vf steps.  */
2681       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2682       edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2683       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2684 					update_e);
2685 
2686       if (skip_epilog)
2687 	{
2688 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2689 				    niters, niters_vector_mult_vf);
2690 	  guard_bb = single_exit (loop)->dest;
2691 	  guard_to = split_edge (single_exit (epilog));
2692 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2693 					   skip_vector ? anchor : guard_bb,
2694 					   prob_epilog.invert (),
2695 					   irred_flag);
2696 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2697 					      single_exit (epilog));
2698 	  /* Only need to handle basic block before epilog loop if it's not
2699 	     the guard_bb, which is the case when skip_vector is true.  */
2700 	  if (guard_bb != bb_before_epilog)
2701 	    {
2702 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2703 
2704 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2705 	    }
2706 	  scale_loop_profile (epilog, prob_epilog, 0);
2707 	}
2708       else
2709 	slpeel_update_phi_nodes_for_lcssa (epilog);
2710 
2711       unsigned HOST_WIDE_INT bound;
2712       if (bound_scalar.is_constant (&bound))
2713 	{
2714 	  gcc_assert (bound != 0);
2715 	  /* -1 to convert loop iterations to latch iterations.  */
2716 	  record_niter_bound (epilog, bound - 1, false, true);
2717 	}
2718 
2719       delete_update_ssa ();
2720       adjust_vec_debug_stmts ();
2721       scev_reset ();
2722     }
2723   adjust_vec.release ();
2724   free_original_copy_tables ();
2725 
2726   return epilog;
2727 }
2728 
2729 /* Function vect_create_cond_for_niters_checks.
2730 
2731    Create a conditional expression that represents the run-time checks for
2732    loop's niter.  The loop is guaranteed to terminate if the run-time
2733    checks hold.
2734 
2735    Input:
2736    COND_EXPR  - input conditional expression.  New conditions will be chained
2737 		with logical AND operation.  If it is NULL, the niters check
2738 		becomes the only condition.
2739    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
2740 		under which the loop's niter information is valid.
2741 
2742    Output:
2743    COND_EXPR - conditional expression.
2744 
2745    The returned COND_EXPR is the conditional expression to be used in the
2746    if statement that controls which version of the loop gets executed at
2747    runtime.  */
2748 
2749 static void
2750 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2751 {
2752   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2753 
2754   if (*cond_expr)
2755     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2756 			      *cond_expr, part_cond_expr);
2757   else
2758     *cond_expr = part_cond_expr;
2759 }
2760 
2761 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2762    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2763 
2764 static void
2765 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2766 {
2767   if (*cond_expr)
2768     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2769 			      *cond_expr, part_cond_expr);
2770   else
2771     *cond_expr = part_cond_expr;
2772 }
2773 
2774 /* Function vect_create_cond_for_align_checks.
2775 
2776    Create a conditional expression that represents the alignment checks for
2777    all the data references (array element references) whose alignment must be
2778    checked at runtime.
2779 
2780    Input:
2781    COND_EXPR  - input conditional expression.  New conditions will be chained
2782                 with logical AND operation.
2783    LOOP_VINFO - two fields of the loop information are used.
2784                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2785                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2786 
2787    Output:
2788    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2789                          expression.
2790    The returned value is the conditional expression to be used in the if
2791    statement that controls which version of the loop gets executed at runtime.
2792 
2793    The algorithm makes two assumptions:
2794      1) The number of bytes "n" in a vector is a power of 2.
2795      2) An address "a" is aligned if a%n is zero and this
2796         test can be done as a&(n-1) == 0.  For example, for 16
2797         byte vectors the test is a&0xf == 0.  */
2798 
2799 static void
2800 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2801                                    tree *cond_expr,
2802 				   gimple_seq *cond_expr_stmt_list)
2803 {
2804   vec<stmt_vec_info> may_misalign_stmts
2805     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2806   stmt_vec_info stmt_info;
2807   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2808   tree mask_cst;
2809   unsigned int i;
2810   tree int_ptrsize_type;
2811   char tmp_name[20];
2812   tree or_tmp_name = NULL_TREE;
2813   tree and_tmp_name;
2814   gimple *and_stmt;
2815   tree ptrsize_zero;
2816   tree part_cond_expr;
2817 
2818   /* Check that mask is one less than a power of 2, i.e., mask is
2819      all zeros followed by all ones.  */
2820   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2821 
2822   int_ptrsize_type = signed_type_for (ptr_type_node);
2823 
2824   /* Create the expression (mask & (dr_1 | ... | dr_n)), where dr_i is the
2825      address of the first vector of the i'th data reference.  */
2826 
2827   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2828     {
2829       gimple_seq new_stmt_list = NULL;
2830       tree addr_base;
2831       tree addr_tmp_name;
2832       tree new_or_tmp_name;
2833       gimple *addr_stmt, *or_stmt;
2834       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2835       bool negative = tree_int_cst_compare
2836 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2837       tree offset = negative
2838 	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2839 
2840       /* create: addr_tmp = (int)(address_of_first_vector) */
2841       addr_base =
2842 	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2843 					      offset);
2844       if (new_stmt_list != NULL)
2845 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2846 
2847       sprintf (tmp_name, "addr2int%d", i);
2848       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2849       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2850       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2851 
2852       /* The addresses are ORed together.  */
2853 
2854       if (or_tmp_name != NULL_TREE)
2855         {
2856           /* create: or_tmp = or_tmp | addr_tmp */
2857           sprintf (tmp_name, "orptrs%d", i);
2858 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2859 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2860 					 or_tmp_name, addr_tmp_name);
2861 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2862           or_tmp_name = new_or_tmp_name;
2863         }
2864       else
2865         or_tmp_name = addr_tmp_name;
2866 
2867     } /* end for i */
2868 
2869   mask_cst = build_int_cst (int_ptrsize_type, mask);
2870 
2871   /* create: and_tmp = or_tmp & mask  */
2872   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2873 
2874   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2875 				  or_tmp_name, mask_cst);
2876   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2877 
2878   /* Make and_tmp the left operand of the conditional test against zero.
2879      If and_tmp has a nonzero bit then some address is unaligned.  */
2880   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2881   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2882 				and_tmp_name, ptrsize_zero);
2883   chain_cond_expr (cond_expr, part_cond_expr);
2884 }
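
/* Illustrative walk-through (hypothetical addresses): for two data
   references whose first-vector addresses are 0x1000 and 0x2008 and a
   16-byte alignment requirement (mask = 0xf), the generated sequence
   amounts to

     addr2int0 = 0x1000;
     addr2int1 = 0x2008;
     orptrs1   = addr2int0 | addr2int1;   (value 0x3008)
     andmask   = orptrs1 & 0xf;           (value 0x8, nonzero)

   so the EQ_EXPR against zero is false and the scalar version of the loop
   runs; if both addresses were 16-byte aligned the AND would be zero and
   the vectorized version would be selected.  */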
2885 
2886 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2887    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2888    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2889    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2890 
2891 static void
2892 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2893 {
2894   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2895   unsigned int i;
2896   vec_object_pair *pair;
2897   FOR_EACH_VEC_ELT (pairs, i, pair)
2898     {
2899       tree addr1 = build_fold_addr_expr (pair->first);
2900       tree addr2 = build_fold_addr_expr (pair->second);
2901       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2902 					 addr1, addr2);
2903       chain_cond_expr (cond_expr, part_cond_expr);
2904     }
2905 }
2906 
2907 /* Create an expression that is true when all lower-bound conditions for
2908    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
2909 
2910 static void
2911 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2912 {
2913   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2914   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2915     {
2916       tree expr = lower_bounds[i].expr;
2917       tree type = unsigned_type_for (TREE_TYPE (expr));
2918       expr = fold_convert (type, expr);
2919       poly_uint64 bound = lower_bounds[i].min_value;
2920       if (!lower_bounds[i].unsigned_p)
2921 	{
2922 	  expr = fold_build2 (PLUS_EXPR, type, expr,
2923 			      build_int_cstu (type, bound - 1));
2924 	  bound += bound - 1;
2925 	}
2926       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2927 					 build_int_cstu (type, bound));
2928       chain_cond_expr (cond_expr, part_cond_expr);
2929     }
2930 }
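
/* A worked example for the signed case above (hypothetical values): for a
   signed EXPR whose required minimum magnitude is 4, the emitted check is

     (unsigned) EXPR + 3 >= 2 * 4 - 1

   which, by modular arithmetic, holds exactly when EXPR >= 4 or
   EXPR <= -4; for an unsigned EXPR the check is simply EXPR >= 4.  */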
2931 
2932 /* Function vect_create_cond_for_alias_checks.
2933 
2934    Create a conditional expression that represents the run-time checks for
2935    overlapping of address ranges represented by a list of data references
2936    relations passed as input.
2937 
2938    Input:
2939    COND_EXPR  - input conditional expression.  New conditions will be chained
2940                 with logical AND operation.  If it is NULL, the alias checks
2941                 become the only condition.
2942    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2943 	        to be checked.
2944 
2945    Output:
2946    COND_EXPR - conditional expression.
2947 
2948    The returned COND_EXPR is the conditional expression to be used in the if
2949    statement that controls which version of the loop gets executed at runtime.
2950 */
2951 
2952 void
2953 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2954 {
2955   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2956     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2957 
2958   if (comp_alias_ddrs.is_empty ())
2959     return;
2960 
2961   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2962 			       &comp_alias_ddrs, cond_expr);
2963   if (dump_enabled_p ())
2964     dump_printf_loc (MSG_NOTE, vect_location,
2965 		     "created %u versioning for alias checks.\n",
2966 		     comp_alias_ddrs.length ());
2967 }
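
/* Each check produced by create_runtime_alias_checks tests that two segments
   do not overlap, i.e. roughly

     addr_a + seg_len_a <= addr_b  ||  addr_b + seg_len_b <= addr_a

   (a sketch only; the exact expressions, and any merging of adjacent checks,
   are decided inside create_runtime_alias_checks).  */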
2968 
2969 
2970 /* Function vect_loop_versioning.
2971 
2972    If the loop has data references that may or may not be aligned and/or
2973    has data reference relations whose independence was not proven then
2974    two versions of the loop need to be generated, one which is vectorized
2975    and one which isn't.  A test is then generated to control which of the
2976    loops is executed.  The test checks for the alignment of all of the
2977    data references that may or may not be aligned.  An additional
2978    sequence of runtime tests is generated for each pair of DDRs whose
2979    independence was not proven.  The vectorized version of the loop is
2980    executed only if both alias and alignment tests are passed.
2981 
2982    The test generated to check which version of the loop is executed
2983    is modified to also check for profitability as indicated by the
2984    cost model threshold TH.
2985 
2986    The versioning condition is built internally from the checks described
2987    above and emitted in the basic block that guards the two loop versions.  */
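
/* Conceptually the generated code has the following shape (a rough sketch in
   source terms, not the exact GIMPLE that is produced):

     if (niters >= threshold
	 && all checked addresses suitably aligned
	 && no run-time aliasing between the checked references
	 && ...any other versioning conditions...)
       <copy of the loop that will be vectorized>
     else
       <scalar copy of the loop>   */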
2988 
2989 struct loop *
2990 vect_loop_versioning (loop_vec_info loop_vinfo,
2991 		      unsigned int th, bool check_profitability,
2992 		      poly_uint64 versioning_threshold)
2993 {
2994   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2995   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2996   basic_block condition_bb;
2997   gphi_iterator gsi;
2998   gimple_stmt_iterator cond_exp_gsi;
2999   basic_block merge_bb;
3000   basic_block new_exit_bb;
3001   edge new_exit_e, e;
3002   gphi *orig_phi, *new_phi;
3003   tree cond_expr = NULL_TREE;
3004   gimple_seq cond_expr_stmt_list = NULL;
3005   tree arg;
3006   profile_probability prob = profile_probability::likely ();
3007   gimple_seq gimplify_stmt_list = NULL;
3008   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3009   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3010   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3011   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3012   tree version_simd_if_cond
3013     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3014 
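  /* SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1, i.e. the iteration count
     minus one, so the "scalar_loop_iters >= th - 1" test below requires
     that the original loop runs at least TH times.  */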
3015   if (check_profitability)
3016     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3017 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3018 					    th - 1));
3019   if (maybe_ne (versioning_threshold, 0U))
3020     {
3021       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3022 			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3023 					      versioning_threshold - 1));
3024       if (cond_expr)
3025 	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3026 				 expr, cond_expr);
3027       else
3028 	cond_expr = expr;
3029     }
3030 
3031   if (version_niter)
3032     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3033 
3034   if (cond_expr)
3035     cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3036 					is_gimple_condexpr, NULL_TREE);
3037 
3038   if (version_align)
3039     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3040 				       &cond_expr_stmt_list);
3041 
3042   if (version_alias)
3043     {
3044       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3045       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3046       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3047     }
3048 
3049   if (version_simd_if_cond)
3050     {
3051       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3052       if (flag_checking)
3053 	if (basic_block bb
3054 	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3055 	  gcc_assert (bb != loop->header
3056 		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3057 		      && (scalar_loop == NULL
3058 			  || (bb != scalar_loop->header
3059 			      && dominated_by_p (CDI_DOMINATORS,
3060 						 scalar_loop->header, bb))));
3061       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3062       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3063 			    version_simd_if_cond, zero);
3064       if (cond_expr)
3065         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3066 				 c, cond_expr);
3067       else
3068         cond_expr = c;
3069       if (dump_enabled_p ())
3070 	dump_printf_loc (MSG_NOTE, vect_location,
3071 			 "created versioning for simd if condition check.\n");
3072     }
3073 
3074   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3075 				      &gimplify_stmt_list,
3076 				      is_gimple_condexpr, NULL_TREE);
3077   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
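
  /* COND_EXPR is now a single gimplified condition; any statements needed
     to compute it have been accumulated in COND_EXPR_STMT_LIST and are
     inserted into CONDITION_BB near the end of this function.  */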
3078 
3079   initialize_original_copy_tables ();
3080   if (scalar_loop)
3081     {
3082       edge scalar_e;
3083       basic_block preheader, scalar_preheader;
3084 
3085       /* We don't want to scale SCALAR_LOOP's frequencies; we need to
3086 	 scale LOOP's frequencies instead.  */
3087       nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3088 			    prob, prob.invert (), prob, prob.invert (), true);
3089       scale_loop_frequencies (loop, prob);
3090       /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3091 	 while we need to move it above LOOP's preheader.  */
3092       e = loop_preheader_edge (loop);
3093       scalar_e = loop_preheader_edge (scalar_loop);
3094       /* The vector loop preheader might not be empty, since new
3095 	 invariants could have been created while analyzing the loop.  */
3096       gcc_assert (single_pred_p (e->src));
3097       gcc_assert (empty_block_p (scalar_e->src)
3098 		  && single_pred_p (scalar_e->src));
3099       gcc_assert (single_pred_p (condition_bb));
3100       preheader = e->src;
3101       scalar_preheader = scalar_e->src;
3102       scalar_e = find_edge (condition_bb, scalar_preheader);
3103       e = single_pred_edge (preheader);
3104       redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3105 				      scalar_preheader);
3106       redirect_edge_and_branch_force (scalar_e, preheader);
3107       redirect_edge_and_branch_force (e, condition_bb);
3108       set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3109 			       single_pred (condition_bb));
3110       set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3111 			       single_pred (scalar_preheader));
3112       set_immediate_dominator (CDI_DOMINATORS, preheader,
3113 			       condition_bb);
3114     }
3115   else
3116     nloop = loop_version (loop, cond_expr, &condition_bb,
3117 			  prob, prob.invert (), prob, prob.invert (), true);
3118 
3119   if (version_niter)
3120     {
3121       /* The versioned loop could be infinite; we need to clear the existing
3122 	 niter information, which was copied from the original loop.  */
3123       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3124       vect_free_loop_info_assumptions (nloop);
3125       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3126       loop_constraint_set (loop, LOOP_C_INFINITE);
3127     }
3128 
3129   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3130       && dump_enabled_p ())
3131     {
3132       if (version_alias)
3133         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3134 			 vect_location,
3135                          "loop versioned for vectorization because of "
3136 			 "possible aliasing\n");
3137       if (version_align)
3138         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3139 			 vect_location,
3140                          "loop versioned for vectorization to enhance "
3141 			 "alignment\n");
3142 
3143     }
3144   free_original_copy_tables ();
3145 
3146   /* Loop versioning violates an assumption we try to maintain during
3147      vectorization - that the loop exit block has a single predecessor.
3148      After versioning, the exit block of both loop versions is the same
3149      basic block (i.e. it has two predecessors).  In order to simplify the
3150      following transformations in the vectorizer, we fix this situation
3151      here by adding a new (empty) block on the exit-edge of the loop,
3152      with the proper loop-exit phis to maintain loop-closed-form.
3153      If loop versioning was done from scalar_loop rather than from loop,
3154      merge_bb will already have just a single predecessor.  */
3155 
3156   merge_bb = single_exit (loop)->dest;
3157   if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3158     {
3159       gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3160       new_exit_bb = split_edge (single_exit (loop));
3161       new_exit_e = single_exit (loop);
3162       e = EDGE_SUCC (new_exit_bb, 0);
3163 
3164       for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3165 	{
3166 	  tree new_res;
3167 	  orig_phi = gsi.phi ();
3168 	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3169 	  new_phi = create_phi_node (new_res, new_exit_bb);
3170 	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3171 	  add_phi_arg (new_phi, arg, new_exit_e,
3172 		       gimple_phi_arg_location_from_edge (orig_phi, e));
3173 	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3174 	}
3175     }
3176 
3177   /* End loop-exit-fixes after versioning.  */
3178 
3179   if (cond_expr_stmt_list)
3180     {
3181       cond_exp_gsi = gsi_last_bb (condition_bb);
3182       gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3183 			     GSI_SAME_STMT);
3184     }
3185   update_ssa (TODO_update_ssa);
3186 
3187   return nloop;
3188 }
3189