1 /* Vectorizer Specific Loop Manipulations
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53 
54 /*************************************************************************
55   Simple Loop Peeling Utilities
56 
57   Utilities to support loop peeling for vectorization purposes.
58  *************************************************************************/
59 
60 
61 /* Renames the use *OP_P.  */
62 
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66   tree new_name;
67 
68   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69     return;
70 
71   new_name = get_current_def (USE_FROM_PTR (op_p));
72 
73   /* Something defined outside of the loop.  */
74   if (!new_name)
75     return;
76 
77   /* An ordinary ssa name defined in the loop.  */
78 
79   SET_USE (op_p, new_name);
80 }
81 
82 
83 /* Renames the variables in basic block BB.  Allow renaming of PHI arguments
84    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85    true.  */
86 
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90   gimple *stmt;
91   use_operand_p use_p;
92   ssa_op_iter iter;
93   edge e;
94   edge_iterator ei;
95   class loop *loop = bb->loop_father;
96   class loop *outer_loop = NULL;
97 
98   if (rename_from_outer_loop)
99     {
100       gcc_assert (loop);
101       outer_loop = loop_outer (loop);
102     }
103 
104   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105        gsi_next (&gsi))
106     {
107       stmt = gsi_stmt (gsi);
108       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 	rename_use_op (use_p);
110     }
111 
112   FOR_EACH_EDGE (e, ei, bb->preds)
113     {
114       if (!flow_bb_inside_loop_p (loop, e->src))
115 	{
116 	  if (!rename_from_outer_loop)
117 	    continue;
118 	  if (e->src != outer_loop->header)
119 	    {
120 	      if (outer_loop->inner->next)
121 		{
122 		  /* If outer_loop has 2 inner loops, allow there to
123 		     be an extra basic block which decides which of the
124 		     two loops to use using LOOP_VECTORIZED.  */
125 		  if (!single_pred_p (e->src)
126 		      || single_pred (e->src) != outer_loop->header)
127 		    continue;
128 		}
129 	    }
130 	}
131       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 	   gsi_next (&gsi))
133         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134     }
135 }
136 
137 
138 struct adjust_info
139 {
140   tree from, to;
141   basic_block bb;
142 };
143 
144 /* A stack of values to be adjusted in debug stmts.  We have to
145    process them LIFO, so that the closest substitution applies.  If we
146    processed them FIFO, without the stack, we might substitute uses
147    with a PHI DEF that would soon become non-dominant, and when we got
148    to the suitable one, it wouldn't have anything to substitute any
149    more.  */
150 static vec<adjust_info, va_heap> adjust_vec;
151 
152 /* Adjust any debug stmts that referenced AI->from values to use the
153    loop-closed AI->to, if the references are dominated by AI->bb and
154    not by the definition of AI->from.  */
155 
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159   basic_block bbphi = ai->bb;
160   tree orig_def = ai->from;
161   tree new_def = ai->to;
162   imm_use_iterator imm_iter;
163   gimple *stmt;
164   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165 
166   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167 
168   /* Adjust any debug stmts that held onto non-loop-closed
169      references.  */
170   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171     {
172       use_operand_p use_p;
173       basic_block bbuse;
174 
175       if (!is_gimple_debug (stmt))
176 	continue;
177 
178       gcc_assert (gimple_debug_bind_p (stmt));
179 
180       bbuse = gimple_bb (stmt);
181 
182       if ((bbuse == bbphi
183 	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 	  && !(bbuse == bbdef
185 	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 	{
187 	  if (new_def)
188 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 	      SET_USE (use_p, new_def);
190 	  else
191 	    {
192 	      gimple_debug_bind_reset_value (stmt);
193 	      update_stmt (stmt);
194 	    }
195 	}
196     }
197 }
198 
199 /* Adjust debug stmts as scheduled before.  */
200 
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204   if (!MAY_HAVE_DEBUG_BIND_STMTS)
205     return;
206 
207   gcc_assert (adjust_vec.exists ());
208 
209   while (!adjust_vec.is_empty ())
210     {
211       adjust_debug_stmts_now (&adjust_vec.last ());
212       adjust_vec.pop ();
213     }
214 }
215 
216 /* Adjust any debug stmts that referenced FROM values to use the
217    loop-closed TO, if the references are dominated by BB and not by
218    the definition of FROM.  If adjust_vec is non-NULL, adjustments
219    will be postponed until adjust_vec_debug_stmts is called.  */
220 
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224   adjust_info ai;
225 
226   if (MAY_HAVE_DEBUG_BIND_STMTS
227       && TREE_CODE (from) == SSA_NAME
228       && ! SSA_NAME_IS_DEFAULT_DEF (from)
229       && ! virtual_operand_p (from))
230     {
231       ai.from = from;
232       ai.to = to;
233       ai.bb = bb;
234 
235       if (adjust_vec.exists ())
236 	adjust_vec.safe_push (ai);
237       else
238 	adjust_debug_stmts_now (&ai);
239     }
240 }
241 
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243    to adjust any debug stmts that referenced the old phi arg,
244    presumably non-loop-closed references left over from other
245    transformations.  */
246 
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250   tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251 
252   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253 
254   if (MAY_HAVE_DEBUG_BIND_STMTS)
255     adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 			gimple_bb (update_phi));
257 }
258 
259 /* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
260    that the control should have during the first iteration and NEXT_CTRL is the
261    value that it should have on subsequent iterations.  */
262 
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 		       tree next_ctrl)
266 {
267   gphi *phi = create_phi_node (ctrl, loop->header);
268   add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269   add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271 
272 /* Add SEQ to the end of LOOP's preheader block.  */
273 
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277   if (seq)
278     {
279       edge pe = loop_preheader_edge (loop);
280       basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281       gcc_assert (!new_bb);
282     }
283 }
284 
285 /* Add SEQ to the beginning of LOOP's header block.  */
286 
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290   if (seq)
291     {
292       gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294     }
295 }
296 
297 /* Return true if the target can interleave elements of two vectors.
298    OFFSET is 0 if the first half of the vectors should be interleaved
299    or 1 if the second half should.  When returning true, store the
300    associated permutation in INDICES.  */
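/* For instance, with 8-element vectors OFFSET 0 requests the "low"
   interleaving { 0, 8, 1, 9, 2, 10, ... } and OFFSET 1 the "high"
   interleaving { 4, 12, 5, 13, ... }; only the first three index pairs
   need to be pushed explicitly because the pattern repeats.  */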
301 
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 			unsigned int offset)
305 {
306   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307   poly_uint64 base = exact_div (nelts, 2) * offset;
308   vec_perm_builder sel (nelts, 2, 3);
309   for (unsigned int i = 0; i < 3; ++i)
310     {
311       sel.quick_push (base + i);
312       sel.quick_push (base + i + nelts);
313     }
314   indices->new_vector (sel, 2, nelts);
315   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317 
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319    in SRC_RGM, given that the former has twice as many masks as the
320    latter.  Return true on success, adding any new statements to SEQ.  */
321 
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 			       rgroup_controls *src_rgm)
325 {
326   tree src_masktype = src_rgm->type;
327   tree dest_masktype = dest_rgm->type;
328   machine_mode src_mode = TYPE_MODE (src_masktype);
329   insn_code icode1, icode2;
330   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331       && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 				  src_mode)) != CODE_FOR_nothing
333       && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 				  src_mode)) != CODE_FOR_nothing)
335     {
336       /* Unpacking the source masks gives at least as many mask bits as
337 	 we need.  We can then VIEW_CONVERT any excess bits away.  */
338       machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339       gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340       tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
342 	{
343 	  tree src = src_rgm->controls[i / 2];
344 	  tree dest = dest_rgm->controls[i];
345 	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 			    ? VEC_UNPACK_HI_EXPR
347 			    : VEC_UNPACK_LO_EXPR);
348 	  gassign *stmt;
349 	  if (dest_masktype == unpack_masktype)
350 	    stmt = gimple_build_assign (dest, code, src);
351 	  else
352 	    {
353 	      tree temp = make_ssa_name (unpack_masktype);
354 	      stmt = gimple_build_assign (temp, code, src);
355 	      gimple_seq_add_stmt (seq, stmt);
356 	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 					  build1 (VIEW_CONVERT_EXPR,
358 						  dest_masktype, temp));
359 	    }
360 	  gimple_seq_add_stmt (seq, stmt);
361 	}
362       return true;
363     }
364   vec_perm_indices indices[2];
365   if (dest_masktype == src_masktype
366       && interleave_supported_p (&indices[0], src_masktype, 0)
367       && interleave_supported_p (&indices[1], src_masktype, 1))
368     {
369       /* The destination requires twice as many mask bits as the source, so
370 	 we can use interleaving permutes to double up the number of bits.  */
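      /* For example, an 8-element source mask { a, b, c, d, e, f, g, h }
	 becomes the destination masks { a, a, b, b, c, c, d, d } and
	 { e, e, f, f, g, g, h, h }: each source bit now covers two
	 adjacent destination elements.  */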
371       tree masks[2];
372       for (unsigned int i = 0; i < 2; ++i)
373 	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374       for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
375 	{
376 	  tree src = src_rgm->controls[i / 2];
377 	  tree dest = dest_rgm->controls[i];
378 	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 					      src, src, masks[i & 1]);
380 	  gimple_seq_add_stmt (seq, stmt);
381 	}
382       return true;
383     }
384   return false;
385 }
386 
387 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
388    for all the rgroup controls in RGC and return a control that is nonzero
389    when the loop needs to iterate.  Add any new preheader statements to
390    PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
391 
392    RGC belongs to loop LOOP.  The loop originally iterated NITERS
393    times and has been vectorized according to LOOP_VINFO.
394 
395    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396    starts with NITERS_SKIP dummy iterations of the scalar loop before
397    the real work starts.  The mask elements for these dummy iterations
398    must be 0, to ensure that the extra iterations do not have an effect.
399 
400    It is known that:
401 
402      NITERS * RGC->max_nscalars_per_iter * RGC->factor
403 
404    does not overflow.  However, MIGHT_WRAP_P says whether an induction
405    variable that starts at 0 and has step:
406 
407      VF * RGC->max_nscalars_per_iter * RGC->factor
408 
409    might overflow before hitting a value above:
410 
411      (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
412 
413    This means that we cannot guarantee that such an induction variable
414    would ever hit a value that produces a set of all-false masks or zero
415    lengths for RGC.
416 
417    Note: the cost of the code generated by this function is modeled
418    by vect_estimate_min_profitable_iters, so changes here may need
419    corresponding changes there.  */
420 
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 				 gimple_seq *preheader_seq,
424 				 gimple_seq *header_seq,
425 				 gimple_stmt_iterator loop_cond_gsi,
426 				 rgroup_controls *rgc, tree niters,
427 				 tree niters_skip, bool might_wrap_p)
428 {
429   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
430   tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
431   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
432 
433   tree ctrl_type = rgc->type;
434   unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
435   poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
436   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
437   tree length_limit = NULL_TREE;
438   /* For length, we need length_limit to ensure length in range.  */
439   if (!use_masks_p)
440     length_limit = build_int_cst (compare_type, nitems_per_ctrl);
441 
442   /* Calculate the maximum number of item values that the rgroup
443      handles in total, the number that it handles for each iteration
444      of the vector loop, and the number that it should skip during the
445      first iteration of the vector loop.  */
446   tree nitems_total = niters;
447   tree nitems_step = build_int_cst (iv_type, vf);
448   tree nitems_skip = niters_skip;
449   if (nitems_per_iter != 1)
450     {
451       /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
452 	 these multiplications don't overflow.  */
453       tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
454       tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
455       nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
456 				   nitems_total, compare_factor);
457       nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
458 				  nitems_step, iv_factor);
459       if (nitems_skip)
460 	nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
461 				    nitems_skip, compare_factor);
462     }
463 
464   /* Create an induction variable that counts the number of items
465      processed.  */
466   tree index_before_incr, index_after_incr;
467   gimple_stmt_iterator incr_gsi;
468   bool insert_after;
469   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
470   create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
471 	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
472 
473   tree zero_index = build_int_cst (compare_type, 0);
474   tree test_index, test_limit, first_limit;
475   gimple_stmt_iterator *test_gsi;
476   if (might_wrap_p)
477     {
478       /* In principle the loop should stop iterating once the incremented
479 	 IV reaches a value greater than or equal to:
480 
481 	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
482 
483 	 However, there's no guarantee that this addition doesn't overflow
484 	 the comparison type, or that the IV hits a value above it before
485 	 wrapping around.  We therefore adjust the limit down by one
486 	 IV step:
487 
488 	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
489 	   -[infinite-prec] NITEMS_STEP
490 
491 	 and compare the IV against this limit _before_ incrementing it.
492 	 Since the comparison type is unsigned, we actually want the
493 	 subtraction to saturate at zero:
494 
495 	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
496 	   -[sat] NITEMS_STEP
497 
498 	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
499 
500 	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
501 
502 	 where the rightmost subtraction can be done directly in
503 	 COMPARE_TYPE.  */
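      /* As a rough illustration, assume a 16-bit unsigned COMPARE_TYPE
	 with NITEMS_TOTAL = 0xfff0, NITEMS_SKIP = 0 and NITEMS_STEP = 0x20:
	 the code below computes MAX (0xfff0, 0x20) - 0x20 = 0xffd0 and
	 compares the pre-increment IV against that, so the IV never needs
	 to exceed 0xfff0 and cannot wrap.  */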
504       test_index = index_before_incr;
505       tree adjust = gimple_convert (preheader_seq, compare_type,
506 				    nitems_step);
507       if (nitems_skip)
508 	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
509 			       adjust, nitems_skip);
510       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
511 				 nitems_total, adjust);
512       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
513 				 test_limit, adjust);
514       test_gsi = &incr_gsi;
515 
516       /* Get a safe limit for the first iteration.  */
517       if (nitems_skip)
518 	{
519 	  /* The first vector iteration can handle at most NITEMS_STEP
520 	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
521 	     NITEMS_SKIP to that cannot overflow.  */
522 	  tree const_limit = build_int_cst (compare_type,
523 					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
524 					    * nitems_per_iter);
525 	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
526 				      nitems_total, const_limit);
527 	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
528 				      first_limit, nitems_skip);
529 	}
530       else
531 	/* For the first iteration it doesn't matter whether the IV hits
532 	   a value above NITEMS_TOTAL.  That only matters for the latch
533 	   condition.  */
534 	first_limit = nitems_total;
535     }
536   else
537     {
538       /* Test the incremented IV, which will always hit a value above
539 	 the bound before wrapping.  */
540       test_index = index_after_incr;
541       test_limit = nitems_total;
542       if (nitems_skip)
543 	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
544 				   test_limit, nitems_skip);
545       test_gsi = &loop_cond_gsi;
546 
547       first_limit = test_limit;
548     }
549 
550   /* Convert the IV value to the comparison type (either a no-op or
551      a demotion).  */
552   gimple_seq test_seq = NULL;
553   test_index = gimple_convert (&test_seq, compare_type, test_index);
554   gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
555 
556   /* Provide a definition of each control in the group.  */
557   tree next_ctrl = NULL_TREE;
558   tree ctrl;
559   unsigned int i;
560   FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
561     {
562       /* Previous controls will cover BIAS items.  This control covers the
563 	 next batch.  */
564       poly_uint64 bias = nitems_per_ctrl * i;
565       tree bias_tree = build_int_cst (compare_type, bias);
566 
567       /* See whether the first iteration of the vector loop is known
568 	 to have a full control.  */
569       poly_uint64 const_limit;
570       bool first_iteration_full
571 	= (poly_int_tree_p (first_limit, &const_limit)
572 	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
573 
574       /* Rather than have a new IV that starts at BIAS and goes up to
575 	 TEST_LIMIT, prefer to use the same 0-based IV for each control
576 	 and adjust the bound down by BIAS.  */
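      /* E.g. with two controls covering 8 items each, the second control
	 (BIAS 8) is tested against MAX (TEST_LIMIT, 8) - 8, so the
	 adjusted bound saturates at zero rather than wrapping.  */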
577       tree this_test_limit = test_limit;
578       if (i != 0)
579 	{
580 	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
581 					  compare_type, this_test_limit,
582 					  bias_tree);
583 	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
584 					  compare_type, this_test_limit,
585 					  bias_tree);
586 	}
587 
588       /* Create the initial control.  First include all items that
589 	 are within the loop limit.  */
590       tree init_ctrl = NULL_TREE;
591       if (!first_iteration_full)
592 	{
593 	  tree start, end;
594 	  if (first_limit == test_limit)
595 	    {
596 	      /* Use a natural test between zero (the initial IV value)
597 		 and the loop limit.  The "else" block would be valid too,
598 		 but this choice can avoid the need to load BIAS_TREE into
599 		 a register.  */
600 	      start = zero_index;
601 	      end = this_test_limit;
602 	    }
603 	  else
604 	    {
605 	      /* FIRST_LIMIT is the maximum number of items handled by the
606 		 first iteration of the vector loop.  Test the portion
607 		 associated with this control.  */
608 	      start = bias_tree;
609 	      end = first_limit;
610 	    }
611 
612 	  if (use_masks_p)
613 	    init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
614 					start, end, "max_mask");
615 	  else
616 	    {
617 	      init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
618 	      gimple_seq seq = vect_gen_len (init_ctrl, start,
619 					     end, length_limit);
620 	      gimple_seq_add_seq (preheader_seq, seq);
621 	    }
622 	}
623 
624       /* Now AND out the bits that are within the number of skipped
625 	 items.  */
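      /* E.g. with NITERS_SKIP = 3, the first three lanes of the first
	 control must be forced to zero so that the skipped scalar
	 iterations have no effect.  */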
626       poly_uint64 const_skip;
627       if (nitems_skip
628 	  && !(poly_int_tree_p (nitems_skip, &const_skip)
629 	       && known_le (const_skip, bias)))
630 	{
631 	  gcc_assert (use_masks_p);
632 	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
633 						    bias_tree, nitems_skip);
634 	  if (init_ctrl)
635 	    init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
636 				      init_ctrl, unskipped_mask);
637 	  else
638 	    init_ctrl = unskipped_mask;
639 	}
640 
641       if (!init_ctrl)
642 	{
643 	  /* First iteration is full.  */
644 	  if (use_masks_p)
645 	    init_ctrl = build_minus_one_cst (ctrl_type);
646 	  else
647 	    init_ctrl = length_limit;
648 	}
649 
650       /* Get the control value for the next iteration of the loop.  */
651       if (use_masks_p)
652 	{
653 	  gimple_seq stmts = NULL;
654 	  next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
655 				      this_test_limit, "next_mask");
656 	  gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
657 	}
658       else
659 	{
660 	  next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
661 	  gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
662 					 length_limit);
663 	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
664 	}
665 
666       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
667     }
668 
669   int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
670   if (partial_load_bias != 0)
671     {
672       tree adjusted_len = rgc->bias_adjusted_ctrl;
673       gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
674 					    rgc->controls[0],
675 					    build_int_cst
676 					    (TREE_TYPE (rgc->controls[0]),
677 					     partial_load_bias));
678       gimple_seq_add_stmt (header_seq, minus);
679     }
680 
681   return next_ctrl;
682 }
683 
684 /* Set up the iteration condition and rgroup controls for LOOP, given
685    that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
686    loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
687    the number of iterations of the original scalar loop that should be
688    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
689    for vect_set_loop_condition.
690 
691    Insert the branch-back condition before LOOP_COND_GSI and return the
692    final gcond.  */
693 
694 static gcond *
695 vect_set_loop_condition_partial_vectors (class loop *loop,
696 					 loop_vec_info loop_vinfo, tree niters,
697 					 tree final_iv, bool niters_maybe_zero,
698 					 gimple_stmt_iterator loop_cond_gsi)
699 {
700   gimple_seq preheader_seq = NULL;
701   gimple_seq header_seq = NULL;
702 
703   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
704   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
705   unsigned int compare_precision = TYPE_PRECISION (compare_type);
706   tree orig_niters = niters;
707 
708   /* Type of the initial value of NITERS.  */
709   tree ni_actual_type = TREE_TYPE (niters);
710   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
711   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
712 
713   /* Convert NITERS to the same size as the compare.  */
714   if (compare_precision > ni_actual_precision
715       && niters_maybe_zero)
716     {
717       /* We know that there is always at least one iteration, so if the
718 	 count is zero then it must have wrapped.  Cope with this by
719 	 subtracting 1 before the conversion and adding 1 to the result.  */
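      /* For instance, with a 16-bit NITERS and a wider COMPARE_TYPE,
	 a count of 0 really means 0x10000 iterations: 0 - 1 = 0xffff,
	 which zero-extends to 0xffff, and adding 1 back gives 0x10000.  */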
720       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
721       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
722 			     niters, build_minus_one_cst (ni_actual_type));
723       niters = gimple_convert (&preheader_seq, compare_type, niters);
724       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
725 			     niters, build_one_cst (compare_type));
726     }
727   else
728     niters = gimple_convert (&preheader_seq, compare_type, niters);
729 
730   /* Iterate over all the rgroups and fill in their controls.  We could use
731      the first control from any rgroup for the loop condition; here we
732      arbitrarily pick the last.  */
733   tree test_ctrl = NULL_TREE;
734   rgroup_controls *rgc;
735   unsigned int i;
736   auto_vec<rgroup_controls> *controls = use_masks_p
737 					  ? &LOOP_VINFO_MASKS (loop_vinfo)
738 					  : &LOOP_VINFO_LENS (loop_vinfo);
739   FOR_EACH_VEC_ELT (*controls, i, rgc)
740     if (!rgc->controls.is_empty ())
741       {
742 	/* First try using permutes.  This adds a single vector
743 	   instruction to the loop for each mask, but needs no extra
744 	   loop invariants or IVs.  */
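	/* E.g. the rgroup that needs two masks per iteration can be
	   derived from the single-mask rgroup, and the four-mask rgroup
	   from the two-mask one.  */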
745 	unsigned int nmasks = i + 1;
746 	if (use_masks_p && (nmasks & 1) == 0)
747 	  {
748 	    rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
749 	    if (!half_rgc->controls.is_empty ()
750 		&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
751 	      continue;
752 	  }
753 
754 	/* See whether zero-based IV would ever generate all-false masks
755 	   or zero length before wrapping around.  */
756 	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
757 
758 	/* Set up all controls for this group.  */
759 	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
760 						     &preheader_seq,
761 						     &header_seq,
762 						     loop_cond_gsi, rgc,
763 						     niters, niters_skip,
764 						     might_wrap_p);
765       }
766 
767   /* Emit all accumulated statements.  */
768   add_preheader_seq (loop, preheader_seq);
769   add_header_seq (loop, header_seq);
770 
771   /* Get a boolean result that tells us whether to iterate.  */
772   edge exit_edge = single_exit (loop);
773   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
774   tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
775   gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
776 					NULL_TREE, NULL_TREE);
777   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
778 
779   /* The loop iterates (NITERS - 1) / VF + 1 times.
780      Subtract one from this to get the latch count.  */
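  /* For example, NITERS = 10 with VF = 4 gives (10 - 1) / 4 + 1 = 3
     vector iterations and a latch count of (10 - 1) / 4 = 2.  */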
781   tree step = build_int_cst (compare_type,
782 			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
783   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
784 				       build_minus_one_cst (compare_type));
785   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
786 				     niters_minus_one, step);
787 
788   if (final_iv)
789     {
790       gassign *assign = gimple_build_assign (final_iv, orig_niters);
791       gsi_insert_on_edge_immediate (single_exit (loop), assign);
792     }
793 
794   return cond_stmt;
795 }
796 
797 /* Like vect_set_loop_condition, but handle the case in which the vector
798    loop handles exactly VF scalars per iteration.  */
799 
800 static gcond *
801 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
802 				tree final_iv, bool niters_maybe_zero,
803 				gimple_stmt_iterator loop_cond_gsi)
804 {
805   tree indx_before_incr, indx_after_incr;
806   gcond *cond_stmt;
807   gcond *orig_cond;
808   edge pe = loop_preheader_edge (loop);
809   edge exit_edge = single_exit (loop);
810   gimple_stmt_iterator incr_gsi;
811   bool insert_after;
812   enum tree_code code;
813   tree niters_type = TREE_TYPE (niters);
814 
815   orig_cond = get_loop_exit_condition (loop);
816   gcc_assert (orig_cond);
817   loop_cond_gsi = gsi_for_stmt (orig_cond);
818 
819   tree init, limit;
820   if (!niters_maybe_zero && integer_onep (step))
821     {
822       /* In this case we can use a simple 0-based IV:
823 
824 	 A:
825 	   x = 0;
826 	   do
827 	     {
828 	       ...
829 	       x += 1;
830 	     }
831 	   while (x < NITERS);  */
832       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
833       init = build_zero_cst (niters_type);
834       limit = niters;
835     }
836   else
837     {
838       /* The following works for all values of NITERS except 0:
839 
840 	 B:
841 	   x = 0;
842 	   do
843 	     {
844 	       ...
845 	       x += STEP;
846 	     }
847 	   while (x <= NITERS - STEP);
848 
849 	 so that the loop continues to iterate if x + STEP - 1 < NITERS
850 	 but stops if x + STEP - 1 >= NITERS.
851 
852 	 However, if NITERS is zero, x never hits a value above NITERS - STEP
853 	 before wrapping around.  There are two obvious ways of dealing with
854 	 this:
855 
856 	 - start at STEP - 1 and compare x before incrementing it
857 	 - start at -1 and compare x after incrementing it
858 
859 	 The latter is simpler and is what we use.  The loop in this case
860 	 looks like:
861 
862 	 C:
863 	   x = -1;
864 	   do
865 	     {
866 	       ...
867 	       x += STEP;
868 	     }
869 	   while (x < NITERS - STEP);
870 
871 	 In both cases the loop limit is NITERS - STEP.  */
872       gimple_seq seq = NULL;
873       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
874       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
875       if (seq)
876 	{
877 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
878 	  gcc_assert (!new_bb);
879 	}
880       if (niters_maybe_zero)
881 	{
882 	  /* Case C.  */
883 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
884 	  init = build_all_ones_cst (niters_type);
885 	}
886       else
887 	{
888 	  /* Case B.  */
889 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
890 	  init = build_zero_cst (niters_type);
891 	}
892     }
893 
894   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
895   create_iv (init, step, NULL_TREE, loop,
896              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
897   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
898 					      true, NULL_TREE, true,
899 					      GSI_SAME_STMT);
900   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
901 				     true, GSI_SAME_STMT);
902 
903   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
904 				 NULL_TREE);
905 
906   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
907 
908   /* Record the number of latch iterations.  */
909   if (limit == niters)
910     /* Case A: the loop iterates NITERS times.  Subtract one to get the
911        latch count.  */
912     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
913 				       build_int_cst (niters_type, 1));
914   else
915     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
916        Subtract one from this to get the latch count.  */
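    /* For instance, NITERS = 12 with STEP = 4 gives LIMIT = 8, a latch
       count of 8 / 4 = 2 and thus three iterations in total.  */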
917     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
918 				       limit, step);
919 
920   if (final_iv)
921     {
922       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
923 					     indx_after_incr, init);
924       gsi_insert_on_edge_immediate (single_exit (loop), assign);
925     }
926 
927   return cond_stmt;
928 }
929 
930 /* If we're using fully-masked loops, make LOOP iterate:
931 
932       N == (NITERS - 1) / STEP + 1
933 
934    times.  When NITERS is zero, this is equivalent to making the loop
935    execute (1 << M) / STEP times, where M is the precision of NITERS.
936    NITERS_MAYBE_ZERO is true if this last case might occur.
937 
938    If we're not using fully-masked loops, make LOOP iterate:
939 
940       N == (NITERS - STEP) / STEP + 1
941 
942    times, where NITERS is known to be outside the range [1, STEP - 1].
943    This is equivalent to making the loop execute NITERS / STEP times
944    when NITERS is nonzero and (1 << M) / STEP times otherwise.
945    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
946 
947    If FINAL_IV is nonnull, it is an SSA name that should be set to
948    N * STEP on exit from the loop.
949 
950    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
951 
952 void
953 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
954 			 tree niters, tree step, tree final_iv,
955 			 bool niters_maybe_zero)
956 {
957   gcond *cond_stmt;
958   gcond *orig_cond = get_loop_exit_condition (loop);
959   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
960 
961   if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
962     cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
963 							 niters, final_iv,
964 							 niters_maybe_zero,
965 							 loop_cond_gsi);
966   else
967     cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
968 						niters_maybe_zero,
969 						loop_cond_gsi);
970 
971   /* Remove old loop exit test.  */
972   stmt_vec_info orig_cond_info;
973   if (loop_vinfo
974       && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
975     loop_vinfo->remove_stmt (orig_cond_info);
976   else
977     gsi_remove (&loop_cond_gsi, true);
978 
979   if (dump_enabled_p ())
980     dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
981 		     cond_stmt);
982 }
983 
984 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
985    For each pair of corresponding PHI arguments on those edges, ensure
986    that the TO->dest argument has its current_def set to that of the
987    FROM->dest argument.  */
988 
989 static void
990 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
991 {
992   gimple_stmt_iterator gsi_from, gsi_to;
993 
994   for (gsi_from = gsi_start_phis (from->dest),
995        gsi_to = gsi_start_phis (to->dest);
996        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
997     {
998       gimple *from_phi = gsi_stmt (gsi_from);
999       gimple *to_phi = gsi_stmt (gsi_to);
1000       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1001       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1002       if (virtual_operand_p (from_arg))
1003 	{
1004 	  gsi_next (&gsi_from);
1005 	  continue;
1006 	}
1007       if (virtual_operand_p (to_arg))
1008 	{
1009 	  gsi_next (&gsi_to);
1010 	  continue;
1011 	}
1012       if (TREE_CODE (from_arg) != SSA_NAME)
1013 	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1014       else if (TREE_CODE (to_arg) == SSA_NAME
1015 	       && from_arg != to_arg)
1016 	{
1017 	  if (get_current_def (to_arg) == NULL_TREE)
1018 	    {
1019 	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1020 					      TREE_TYPE (get_current_def
1021 							   (from_arg))));
1022 	      set_current_def (to_arg, get_current_def (from_arg));
1023 	    }
1024 	}
1025       gsi_next (&gsi_from);
1026       gsi_next (&gsi_to);
1027     }
1028 
1029   gphi *from_phi = get_virtual_phi (from->dest);
1030   gphi *to_phi = get_virtual_phi (to->dest);
1031   if (from_phi)
1032     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1033 		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1034 }
1035 
1036 
1037 /* Given LOOP this function generates a new copy of it and puts it
1038    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1039    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1040    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1041    entry or exit of LOOP.  */
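/* Note: the vectorizer uses this both for prologue peeling (E is the
   preheader edge, so the copy runs before LOOP) and for epilogue peeling
   (E is the exit edge, so the copy runs after LOOP).  */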
1042 
1043 class loop *
1044 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1045 					class loop *scalar_loop, edge e)
1046 {
1047   class loop *new_loop;
1048   basic_block *new_bbs, *bbs, *pbbs;
1049   bool at_exit;
1050   bool was_imm_dom;
1051   basic_block exit_dest;
1052   edge exit, new_exit;
1053   bool duplicate_outer_loop = false;
1054 
1055   exit = single_exit (loop);
1056   at_exit = (e == exit);
1057   if (!at_exit && e != loop_preheader_edge (loop))
1058     return NULL;
1059 
1060   if (scalar_loop == NULL)
1061     scalar_loop = loop;
1062 
1063   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1064   pbbs = bbs + 1;
1065   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1066   /* Allow duplication of outer loops.  */
1067   if (scalar_loop->inner)
1068     duplicate_outer_loop = true;
1069   /* Check whether duplication is possible.  */
1070   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1071     {
1072       free (bbs);
1073       return NULL;
1074     }
1075 
1076   /* Generate new loop structure.  */
1077   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1078   duplicate_subloops (scalar_loop, new_loop);
1079 
1080   exit_dest = exit->dest;
1081   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1082 					  exit_dest) == loop->header ?
1083 		 true : false);
1084 
1085   /* Also copy the pre-header, this avoids jumping through hoops to
1086      duplicate the loop entry PHI arguments.  Create an empty
1087      pre-header unconditionally for this.  */
1088   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1089   edge entry_e = single_pred_edge (preheader);
1090   bbs[0] = preheader;
1091   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1092 
1093   exit = single_exit (scalar_loop);
1094   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1095 	    &exit, 1, &new_exit, NULL,
1096 	    at_exit ? loop->latch : e->src, true);
1097   exit = single_exit (loop);
1098   basic_block new_preheader = new_bbs[0];
1099 
1100   /* Before installing PHI arguments make sure that the edges
1101      into them match that of the scalar loop we analyzed.  This
1102      makes sure the SLP tree matches up between the main vectorized
1103      loop and the epilogue vectorized copies.  */
1104   if (single_succ_edge (preheader)->dest_idx
1105       != single_succ_edge (new_bbs[0])->dest_idx)
1106     {
1107       basic_block swap_bb = new_bbs[1];
1108       gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1109       std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1110       EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1111       EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1112     }
1113   if (duplicate_outer_loop)
1114     {
1115       class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1116       if (loop_preheader_edge (scalar_loop)->dest_idx
1117 	  != loop_preheader_edge (new_inner_loop)->dest_idx)
1118 	{
1119 	  basic_block swap_bb = new_inner_loop->header;
1120 	  gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1121 	  std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1122 	  EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1123 	  EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1124 	}
1125     }
1126 
1127   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1128 
1129   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1130   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1131     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1132 
1133   if (scalar_loop != loop)
1134     {
1135       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1136 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1137 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1138 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1139 	 header) to have current_def set, so copy them over.  */
1140       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1141 						exit);
1142       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1143 							   0),
1144 						EDGE_SUCC (loop->latch, 0));
1145     }
1146 
1147   if (at_exit) /* Add the loop copy at exit.  */
1148     {
1149       if (scalar_loop != loop)
1150 	{
1151 	  gphi_iterator gsi;
1152 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1153 
1154 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1155 	       gsi_next (&gsi))
1156 	    {
1157 	      gphi *phi = gsi.phi ();
1158 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1159 	      location_t orig_locus
1160 		= gimple_phi_arg_location_from_edge (phi, e);
1161 
1162 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1163 	    }
1164 	}
1165       redirect_edge_and_branch_force (e, new_preheader);
1166       flush_pending_stmts (e);
1167       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1168       if (was_imm_dom || duplicate_outer_loop)
1169 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1170 
1171       /* And remove the non-necessary forwarder again.  Keep the other
1172          one so we have a proper pre-header for the loop at the exit edge.  */
1173       redirect_edge_pred (single_succ_edge (preheader),
1174 			  single_pred (preheader));
1175       delete_basic_block (preheader);
1176       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1177 			       loop_preheader_edge (scalar_loop)->src);
1178     }
1179   else /* Add the copy at entry.  */
1180     {
1181       if (scalar_loop != loop)
1182 	{
1183 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
1184 	  redirect_edge_pred (single_succ_edge (preheader),
1185 			      single_pred (preheader));
1186 	  delete_basic_block (preheader);
1187 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1188 				   loop_preheader_edge (scalar_loop)->src);
1189 	  preheader = split_edge (loop_preheader_edge (loop));
1190 	  entry_e = single_pred_edge (preheader);
1191 	}
1192 
1193       redirect_edge_and_branch_force (entry_e, new_preheader);
1194       flush_pending_stmts (entry_e);
1195       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1196 
1197       redirect_edge_and_branch_force (new_exit, preheader);
1198       flush_pending_stmts (new_exit);
1199       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1200 
1201       /* And remove the non-necessary forwarder again.  Keep the other
1202          one so we have a proper pre-header for the loop at the exit edge.  */
1203       redirect_edge_pred (single_succ_edge (new_preheader),
1204 			  single_pred (new_preheader));
1205       delete_basic_block (new_preheader);
1206       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1207 			       loop_preheader_edge (new_loop)->src);
1208     }
1209 
1210   if (scalar_loop != loop)
1211     {
1212       /* Update new_loop->header PHIs, so that on the preheader
1213 	 edge they are the ones from loop rather than scalar_loop.  */
1214       gphi_iterator gsi_orig, gsi_new;
1215       edge orig_e = loop_preheader_edge (loop);
1216       edge new_e = loop_preheader_edge (new_loop);
1217 
1218       for (gsi_orig = gsi_start_phis (loop->header),
1219 	   gsi_new = gsi_start_phis (new_loop->header);
1220 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1221 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1222 	{
1223 	  gphi *orig_phi = gsi_orig.phi ();
1224 	  gphi *new_phi = gsi_new.phi ();
1225 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1226 	  location_t orig_locus
1227 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1228 
1229 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1230 	}
1231     }
1232 
1233   free (new_bbs);
1234   free (bbs);
1235 
1236   checking_verify_dominators (CDI_DOMINATORS);
1237 
1238   return new_loop;
1239 }
1240 
1241 
1242 /* Given the condition expression COND, put it as the last statement of
1243    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1244    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1245    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1246    new edge as irreducible if IRREDUCIBLE_P is true.  */
1247 
1248 static edge
1249 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1250 		       basic_block guard_to, basic_block dom_bb,
1251 		       profile_probability probability, bool irreducible_p)
1252 {
1253   gimple_stmt_iterator gsi;
1254   edge new_e, enter_e;
1255   gcond *cond_stmt;
1256   gimple_seq gimplify_stmt_list = NULL;
1257 
1258   enter_e = EDGE_SUCC (guard_bb, 0);
1259   enter_e->flags &= ~EDGE_FALLTHRU;
1260   enter_e->flags |= EDGE_FALSE_VALUE;
1261   gsi = gsi_last_bb (guard_bb);
1262 
1263   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1264 				 NULL_TREE);
1265   if (gimplify_stmt_list)
1266     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1267 
1268   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1269   gsi = gsi_last_bb (guard_bb);
1270   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1271 
1272   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1273   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1274 
1275   new_e->probability = probability;
1276   if (irreducible_p)
1277     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1278 
1279   enter_e->probability = probability.invert ();
1280   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1281 
1282   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1283   if (enter_e->dest->loop_father->header == enter_e->dest)
1284     split_edge (enter_e);
1285 
1286   return new_e;
1287 }
1288 
1289 
1290 /* This function verifies that the following restrictions apply to LOOP:
1291    (1) it consists of exactly 2 basic blocks (header and an empty latch)
1292        for an innermost loop, or of 5 basic blocks for an outer loop.
1293    (2) it is single entry, single exit
1294    (3) its exit condition is the last stmt in the header
1295    (4) E is the entry/exit edge of LOOP.
1296  */
1297 
1298 bool
1299 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1300 {
1301   edge exit_e = single_exit (loop);
1302   edge entry_e = loop_preheader_edge (loop);
1303   gcond *orig_cond = get_loop_exit_condition (loop);
1304   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1305   unsigned int num_bb = loop->inner? 5 : 2;
1306 
1307   /* All loops have an outer scope; the only case loop->outer is NULL is for
1308      the function itself.  */
1309   if (!loop_outer (loop)
1310       || loop->num_nodes != num_bb
1311       || !empty_block_p (loop->latch)
1312       || !single_exit (loop)
1313       /* Verify that new loop exit condition can be trivially modified.  */
1314       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1315       || (e != exit_e && e != entry_e))
1316     return false;
1317 
1318   return true;
1319 }
1320 
1321 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1322    in the exit bb and rename all the uses after the loop.  This simplifies
1323    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1324    (but normally loop closed SSA form doesn't require virtual PHIs to be
1325    in the same form).  Doing this early simplifies checking which
1326    uses should be renamed.
1327 
1328    If we create a new phi after the loop, return the definition that
1329    applies on entry to the loop, otherwise return null.  */
1330 
1331 static tree
1332 create_lcssa_for_virtual_phi (class loop *loop)
1333 {
1334   gphi_iterator gsi;
1335   edge exit_e = single_exit (loop);
1336 
1337   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1338     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1339       {
1340 	gphi *phi = gsi.phi ();
1341 	for (gsi = gsi_start_phis (exit_e->dest);
1342 	     !gsi_end_p (gsi); gsi_next (&gsi))
1343 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1344 	    break;
1345 	if (gsi_end_p (gsi))
1346 	  {
1347 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1348 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1349 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1350 	    imm_use_iterator imm_iter;
1351 	    gimple *stmt;
1352 	    use_operand_p use_p;
1353 
1354 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1355 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1356 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1357 	    gimple_phi_set_result (new_phi, new_vop);
1358 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1359 	      if (stmt != new_phi
1360 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1361 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1362 		  SET_USE (use_p, new_vop);
1363 
1364 	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1365 	  }
1366 	break;
1367       }
1368   return NULL_TREE;
1369 }
1370 
1371 /* Function vect_get_loop_location.
1372 
1373    Extract the location of the loop in the source code.
1374    If the loop is not well formed for vectorization, an estimated
1375    location is calculated.
1376    Return the loop location on success and NULL otherwise.  */
1377 
1378 dump_user_location_t
1379 find_loop_location (class loop *loop)
1380 {
1381   gimple *stmt = NULL;
1382   basic_block bb;
1383   gimple_stmt_iterator si;
1384 
1385   if (!loop)
1386     return dump_user_location_t ();
1387 
1388   stmt = get_loop_exit_condition (loop);
1389 
1390   if (stmt
1391       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1392     return stmt;
1393 
1394   /* If we got here the loop is probably not "well formed";
1395      try to estimate the loop location.  */
1396 
1397   if (!loop->header)
1398     return dump_user_location_t ();
1399 
1400   bb = loop->header;
1401 
1402   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1403     {
1404       stmt = gsi_stmt (si);
1405       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1406         return stmt;
1407     }
1408 
1409   return dump_user_location_t ();
1410 }
1411 
1412 /* Return true if the phi described by STMT_INFO defines an IV of the
1413    loop to be vectorized.  */
1414 
1415 static bool
1416 iv_phi_p (stmt_vec_info stmt_info)
1417 {
1418   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1419   if (virtual_operand_p (PHI_RESULT (phi)))
1420     return false;
1421 
1422   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1423       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1424     return false;
1425 
1426   return true;
1427 }
1428 
1429 /* Function vect_can_advance_ivs_p
1430 
1431    In case the number of iterations that LOOP iterates is unknown at compile
1432    time, an epilog loop will be generated, and the loop induction variables
1433    (IVs) will be "advanced" to the value they are supposed to take just before
1434    the epilog loop.  Here we check that the access function of the loop IVs
1435    and the expression that represents the loop bound are simple enough.
1436    These restrictions will be relaxed in the future.  */
1437 
1438 bool
1439 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1440 {
1441   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1442   basic_block bb = loop->header;
1443   gphi_iterator gsi;
1444 
1445   /* Analyze phi functions of the loop header.  */
1446 
1447   if (dump_enabled_p ())
1448     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1449   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1450     {
1451       tree evolution_part;
1452 
1453       gphi *phi = gsi.phi ();
1454       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1455       if (dump_enabled_p ())
1456 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1457 			 phi_info->stmt);
1458 
1459       /* Skip virtual phi's. The data dependences that are associated with
1460 	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1461 
1462 	 Skip reduction phis.  */
1463       if (!iv_phi_p (phi_info))
1464 	{
1465 	  if (dump_enabled_p ())
1466 	    dump_printf_loc (MSG_NOTE, vect_location,
1467 			     "reduc or virtual phi. skip.\n");
1468 	  continue;
1469 	}
1470 
1471       /* Analyze the evolution function.  */
1472 
1473       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1474       if (evolution_part == NULL_TREE)
1475         {
1476 	  if (dump_enabled_p ())
1477 	    dump_printf (MSG_MISSED_OPTIMIZATION,
1478 			 "No access function or evolution.\n");
1479 	  return false;
1480         }
1481 
1482       /* FORNOW: We do not transform initial conditions of IVs
1483 	 which evolution functions are not invariants in the loop.  */
1484 
1485       if (!expr_invariant_in_loop_p (loop, evolution_part))
1486 	{
1487 	  if (dump_enabled_p ())
1488 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489 			     "evolution not invariant in loop.\n");
1490 	  return false;
1491 	}
1492 
1493       /* FORNOW: We do not transform initial conditions of IVs
1494 	 whose evolution functions are polynomials of degree >= 2.  */
1495 
1496       if (tree_is_chrec (evolution_part))
1497 	{
1498 	  if (dump_enabled_p ())
1499 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1500 			     "evolution is chrec.\n");
1501 	  return false;
1502 	}
1503     }
1504 
1505   return true;
1506 }
1507 
1508 
1509 /*   Function vect_update_ivs_after_vectorizer.
1510 
1511      "Advance" the induction variables of LOOP to the value they should take
1512      after the execution of LOOP.  This is currently necessary because the
1513      vectorizer does not handle induction variables that are used after the
1514      loop.  Such a situation occurs when the last iterations of LOOP are
1515      peeled, because:
1516      1. We introduced new uses after LOOP for IVs that were not originally used
1517         after LOOP: the IVs of LOOP are now used by an epilog loop.
1518      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1519         times, whereas the loop IVs should be bumped N times.
1520 
1521      Input:
1522      - LOOP - a loop that is going to be vectorized. The last few iterations
1523               of LOOP were peeled.
1524      - NITERS - the number of iterations that LOOP executes (before it is
1525                 vectorized). i.e, the number of times the ivs should be bumped.
1526      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1527                   coming out from LOOP on which there are uses of the LOOP ivs
1528 		  (this is the path from LOOP->exit to epilog_loop->preheader).
1529 
1530                   The new definitions of the ivs are placed in LOOP->exit.
1531                   The phi args associated with the edge UPDATE_E in the bb
1532                   UPDATE_E->dest are updated accordingly.
1533 
1534      Assumption 1: Like the rest of the vectorizer, this function assumes
1535      a single loop exit that has a single predecessor.
1536 
1537      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1538      organized in the same order.
1539 
1540      Assumption 3: The access function of the ivs is simple enough (see
1541      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1542 
1543      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1544      coming out of LOOP on which the ivs of LOOP are used (this is the path
1545      that leads to the epilog loop; other paths skip the epilog loop).  This
1546      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1547      needs to have its phis updated.
1548  */
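/* Worked example (illustrative only): for an IV with initial value 0,
   step 4 and NITERS = 16, the value installed on UPDATE_E is
   0 + 16 * 4 = 64, i.e. init_expr + niters * step_expr as computed in
   the body below.  */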
1549 
1550 static void
1551 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1552 				  tree niters, edge update_e)
1553 {
1554   gphi_iterator gsi, gsi1;
1555   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1556   basic_block update_bb = update_e->dest;
1557   basic_block exit_bb = single_exit (loop)->dest;
1558 
1559   /* Make sure there exists a single-predecessor exit bb:  */
1560   gcc_assert (single_pred_p (exit_bb));
1561   gcc_assert (single_succ_edge (exit_bb) == update_e);
1562 
1563   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1564        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1565        gsi_next (&gsi), gsi_next (&gsi1))
1566     {
1567       tree init_expr;
1568       tree step_expr, off;
1569       tree type;
1570       tree var, ni, ni_name;
1571       gimple_stmt_iterator last_gsi;
1572 
1573       gphi *phi = gsi.phi ();
1574       gphi *phi1 = gsi1.phi ();
1575       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1576       if (dump_enabled_p ())
1577 	dump_printf_loc (MSG_NOTE, vect_location,
1578 			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1579 
1580       /* Skip reduction and virtual phis.  */
1581       if (!iv_phi_p (phi_info))
1582 	{
1583 	  if (dump_enabled_p ())
1584 	    dump_printf_loc (MSG_NOTE, vect_location,
1585 			     "reduc or virtual phi. skip.\n");
1586 	  continue;
1587 	}
1588 
1589       type = TREE_TYPE (gimple_phi_result (phi));
1590       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1591       step_expr = unshare_expr (step_expr);
1592 
1593       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1594          of degree >= 2 or exponential.  */
1595       gcc_assert (!tree_is_chrec (step_expr));
1596 
1597       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1598 
1599       tree stype = TREE_TYPE (step_expr);
1600       off = fold_build2 (MULT_EXPR, stype,
1601 			 fold_convert (stype, niters), step_expr);
1602       if (POINTER_TYPE_P (type))
1603 	ni = fold_build_pointer_plus (init_expr, off);
1604       else
1605 	ni = fold_convert (type,
1606 			   fold_build2 (PLUS_EXPR, stype,
1607 					fold_convert (stype, init_expr),
1608 					off));
1609 
1610       var = create_tmp_var (type, "tmp");
1611 
1612       last_gsi = gsi_last_bb (exit_bb);
1613       gimple_seq new_stmts = NULL;
1614       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1615       /* EXIT_BB should not normally be empty; insert after its last statement,
	 or at its start otherwise.  */
1616       if (!gsi_end_p (last_gsi))
1617 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1618       else
1619 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1620 
1621       /* Fix phi expressions in the successor bb.  */
1622       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1623     }
1624 }
1625 
1626 /* Return a gimple value containing the misalignment (measured in vector
1627    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1628    it is away from a perfectly aligned address.  Add any new statements
1629    to SEQ.  */
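/* For instance (illustrative, assuming a 16-byte target alignment and
   4-byte elements): a start address that is 8 bytes past a 16-byte
   boundary gives misalign_in_bytes = addr & 15 = 8 and
   misalign_in_elems = 8 >> 2 = 2.  */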
1630 
1631 static tree
1632 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1633 {
1634   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1635   stmt_vec_info stmt_info = dr_info->stmt;
1636   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1637 
1638   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1639   unsigned HOST_WIDE_INT target_align_c;
1640   tree target_align_minus_1;
1641 
1642   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1643 					size_zero_node) < 0;
1644   tree offset = (negative
1645 		 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1646 			     * TREE_INT_CST_LOW
1647 				 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1648 		 : size_zero_node);
1649   tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1650 							  stmt_info, seq,
1651 							  offset);
1652   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1653   if (target_align.is_constant (&target_align_c))
1654     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1655   else
1656     {
1657       tree vla = build_int_cst (type, target_align);
1658       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1659 				    fold_build2 (MINUS_EXPR, type,
1660 						 build_int_cst (type, 0), vla));
1661       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1662 					  build_int_cst (type, 1));
1663     }
1664 
1665   HOST_WIDE_INT elem_size
1666     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1667   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1668 
1669   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1670   tree int_start_addr = fold_convert (type, start_addr);
1671   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1672 					target_align_minus_1);
1673 
1674   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1675   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1676 					elem_size_log);
1677 
1678   return misalign_in_elems;
1679 }
1680 
1681 /* Function vect_gen_prolog_loop_niters
1682 
1683    Generate the number of iterations which should be peeled as prolog for the
1684    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1685    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1686    As a result, after the execution of this loop, the data reference DR will
1687    refer to an aligned location.  The following computation is generated:
1688 
1689    If the misalignment of DR is known at compile time:
1690      addr_mis = int mis = DR_MISALIGNMENT (dr);
1691    Else, compute address misalignment in bytes:
1692      addr_mis = addr & (target_align - 1)
1693 
1694    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1695 
1696    (elem_size = element type size; an element is the scalar element whose type
1697    is the inner type of the vectype)
1698 
1699    The computations will be emitted at the end of BB.  We also compute and
1700    store the upper bound (inclusive) of the result in BOUND.
1701 
1702    When the step of the data-ref in the loop is not 1 (as in interleaved data
1703    and SLP), the number of iterations of the prolog must be divided by the step
1704    (which is equal to the size of the interleaved group).
1705 
1706    The above formulas assume that VF == number of elements in the vector.  This
1707    may not hold when there are multiple types in the loop.
1708    In this case, for some data-references in the loop the VF does not represent
1709    the number of elements that fit in the vector.  Therefore, instead of VF we
1710    use TYPE_VECTOR_SUBPARTS.  */
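/* Worked example (illustrative, assuming a V4SI-like vectype with a
   16-byte target alignment): with VF = 4, elem_size = 4, step = 1 and
   addr_mis = 8 bytes, the formula gives
   prolog_niters = ((4 - 8/4) & 3) / 1 = 2, i.e. two scalar iterations
   are peeled before DR reaches a 16-byte boundary.  */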
1711 
1712 static tree
1713 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1714 			     basic_block bb, int *bound)
1715 {
1716   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1717   tree var;
1718   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1719   gimple_seq stmts = NULL, new_stmts = NULL;
1720   tree iters, iters_name;
1721   stmt_vec_info stmt_info = dr_info->stmt;
1722   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1723   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1724 
1725   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1726     {
1727       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1728 
1729       if (dump_enabled_p ())
1730         dump_printf_loc (MSG_NOTE, vect_location,
1731                          "known peeling = %d.\n", npeel);
1732 
1733       iters = build_int_cst (niters_type, npeel);
1734       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1735     }
1736   else
1737     {
1738       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1739       tree type = TREE_TYPE (misalign_in_elems);
1740       HOST_WIDE_INT elem_size
1741 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1742       /* We only do prolog peeling if the target alignment is known at compile
1743          time.  */
1744       poly_uint64 align_in_elems =
1745 	exact_div (target_align, elem_size);
1746       tree align_in_elems_minus_1 =
1747 	build_int_cst (type, align_in_elems - 1);
1748       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1749 
1750       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1751 				 & (align_in_elems - 1)).  */
1752       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1753 					    size_zero_node) < 0;
1754       if (negative)
1755 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1756 			     align_in_elems_tree);
1757       else
1758 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1759 			     misalign_in_elems);
1760       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1761       iters = fold_convert (niters_type, iters);
1762       unsigned HOST_WIDE_INT align_in_elems_c;
1763       if (align_in_elems.is_constant (&align_in_elems_c))
1764 	*bound = align_in_elems_c - 1;
1765       else
1766 	*bound = -1;
1767     }
1768 
1769   if (dump_enabled_p ())
1770     dump_printf_loc (MSG_NOTE, vect_location,
1771 		     "niters for prolog loop: %T\n", iters);
1772 
1773   var = create_tmp_var (niters_type, "prolog_loop_niters");
1774   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1775 
1776   if (new_stmts)
1777     gimple_seq_add_seq (&stmts, new_stmts);
1778   if (stmts)
1779     {
1780       gcc_assert (single_succ_p (bb));
1781       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1782       if (gsi_end_p (gsi))
1783 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1784       else
1785 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1786     }
1787   return iters_name;
1788 }
1789 
1790 
1791 /* Function vect_update_init_of_dr
1792 
1793    If CODE is PLUS, the vector loop starts NITERS iterations after the
1794    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1795    iterations before the scalar one (using masking to skip inactive
1796    elements).  This function updates the information recorded in DR to
1797    account for the difference.  Specifically, it updates the OFFSET
1798    field of DR_INFO.  */
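/* E.g. (illustrative): with NITERS = 3 and DR_STEP = 4 bytes, the
   recorded offset is adjusted by 12 bytes, added for PLUS and
   subtracted for MINUS.  */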
1799 
1800 static void
1801 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1802 {
1803   struct data_reference *dr = dr_info->dr;
1804   tree offset = dr_info->offset;
1805   if (!offset)
1806     offset = build_zero_cst (sizetype);
1807 
1808   niters = fold_build2 (MULT_EXPR, sizetype,
1809 			fold_convert (sizetype, niters),
1810 			fold_convert (sizetype, DR_STEP (dr)));
1811   offset = fold_build2 (code, sizetype,
1812 			fold_convert (sizetype, offset), niters);
1813   dr_info->offset = offset;
1814 }
1815 
1816 
1817 /* Function vect_update_inits_of_drs
1818 
1819    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1820    CODE and NITERS are as for vect_update_inits_of_dr.  */
1821 
1822 void
1823 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1824 			  tree_code code)
1825 {
1826   unsigned int i;
1827   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1828   struct data_reference *dr;
1829 
1830   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1831 
1832   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1833      here, but since we might use these niters to update the epilogue's niters
1834      and data references we can't insert them here as this definition might not
1835      always dominate its uses.  */
1836   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1837     niters = fold_convert (sizetype, niters);
1838 
1839   FOR_EACH_VEC_ELT (datarefs, i, dr)
1840     {
1841       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1842       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1843 	  && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1844 	vect_update_init_of_dr (dr_info, niters, code);
1845     }
1846 }
1847 
1848 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1849    by masking.  This involves calculating the number of iterations to
1850    be peeled and then aligning all memory references appropriately.  */
1851 
1852 void
1853 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1854 {
1855   tree misalign_in_elems;
1856   tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1857 
1858   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1859 
1860   /* From the information recorded in LOOP_VINFO get the number of iterations
1861      that need to be skipped via masking.  */
1862   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1863     {
1864       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1865 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1866       misalign_in_elems = build_int_cst (type, misalign);
1867     }
1868   else
1869     {
1870       gimple_seq seq1 = NULL, seq2 = NULL;
1871       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1872       misalign_in_elems = fold_convert (type, misalign_in_elems);
1873       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1874 						&seq2, true, NULL_TREE);
1875       gimple_seq_add_seq (&seq1, seq2);
1876       if (seq1)
1877 	{
1878 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1879 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1880 	  gcc_assert (!new_bb);
1881 	}
1882     }
1883 
1884   if (dump_enabled_p ())
1885     dump_printf_loc (MSG_NOTE, vect_location,
1886 		     "misalignment for fully-masked loop: %T\n",
1887 		     misalign_in_elems);
1888 
1889   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1890 
1891   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1892 }
1893 
1894 /* This function builds ni_name = number of iterations.  Statements
1895    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1896    it to TRUE if a new SSA name is generated.  */
1897 
1898 tree
1899 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1900 {
1901   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1902   if (TREE_CODE (ni) == INTEGER_CST)
1903     return ni;
1904   else
1905     {
1906       tree ni_name, var;
1907       gimple_seq stmts = NULL;
1908       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1909 
1910       var = create_tmp_var (TREE_TYPE (ni), "niters");
1911       ni_name = force_gimple_operand (ni, &stmts, false, var);
1912       if (stmts)
1913 	{
1914 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1915 	  if (new_var_p != NULL)
1916 	    *new_var_p = true;
1917 	}
1918 
1919       return ni_name;
1920     }
1921 }
1922 
1923 /* Calculate the number of iterations above which the vectorized loop will be
1924    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1925    of the prolog loop.  If it is an integer constant, its value is also passed
1926    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1927    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1928    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1929    threshold below which the scalar (rather than vectorized) loop will be
1930    executed.  This function stores the upper bound (inclusive) of the result
1931    in BOUND_SCALAR.  */
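/* For example (illustrative): without a profitability check, peeling at
   most 3 prolog and 4 epilog iterations yields
   niters = niters_prolog + 4 and BOUND_SCALAR = 3 + 4 = 7.  */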
1932 
1933 static tree
1934 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1935 			     int bound_prolog, poly_int64 bound_epilog, int th,
1936 			     poly_uint64 *bound_scalar,
1937 			     bool check_profitability)
1938 {
1939   tree type = TREE_TYPE (niters_prolog);
1940   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1941 			     build_int_cst (type, bound_epilog));
1942 
1943   *bound_scalar = bound_prolog + bound_epilog;
1944   if (check_profitability)
1945     {
1946       /* TH indicates the minimum niters of vectorized loop, while we
1947 	 compute the maximum niters of scalar loop.  */
1948       th--;
1949       /* Peeling a constant number of times.  */
1950       if (int_niters_prolog >= 0)
1951 	{
1952 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1953 	  return build_int_cst (type, *bound_scalar);
1954 	}
1955       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1956 	 and BOUND_EPILOG are inclusive upper bounds.  */
1957       if (known_ge (th, bound_prolog + bound_epilog))
1958 	{
1959 	  *bound_scalar = th;
1960 	  return build_int_cst (type, th);
1961 	}
1962       /* Need to do runtime comparison.  */
1963       else if (maybe_gt (th, bound_epilog))
1964 	{
1965 	  *bound_scalar = upper_bound (*bound_scalar, th);
1966 	  return fold_build2 (MAX_EXPR, type,
1967 			      build_int_cst (type, th), niters);
1968 	}
1969     }
1970   return niters;
1971 }
1972 
1973 /* NITERS is the number of times that the original scalar loop executes
1974    after peeling.  Work out the maximum number of iterations N that can
1975    be handled by the vectorized form of the loop and then either:
1976 
1977    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1978 
1979 	niters_vector = N
1980 
1981    b) set *STEP_VECTOR_PTR to one and generate:
1982 
1983         niters_vector = N / vf
1984 
1985    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1986    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1987    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1988 
1989 void
1990 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1991 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1992 			     bool niters_no_overflow)
1993 {
1994   tree ni_minus_gap, var;
1995   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1996   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1998   tree log_vf = NULL_TREE;
1999 
2000   /* If an epilogue loop is required because of data accesses with gaps, we
2001      subtract one iteration from the total number of iterations here for
2002      a correct calculation of the vector iteration count.  */
2003   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2004     {
2005       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2006 				  build_one_cst (type));
2007       if (!is_gimple_val (ni_minus_gap))
2008 	{
2009 	  var = create_tmp_var (type, "ni_gap");
2010 	  gimple *stmts = NULL;
2011 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2012 					       true, var);
2013 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
2014 	}
2015     }
2016   else
2017     ni_minus_gap = niters;
2018 
2019   unsigned HOST_WIDE_INT const_vf;
2020   if (vf.is_constant (&const_vf)
2021       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2022     {
2023       /* Create: niters >> log2(vf) */
2024       /* If it's known that niters == number of latch executions + 1 doesn't
2025 	 overflow, we can generate niters >> log2(vf); otherwise we generate
2026 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2027 	 will be at least one.  */
2028       log_vf = build_int_cst (type, exact_log2 (const_vf));
2029       if (niters_no_overflow)
2030 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2031       else
2032 	niters_vector
2033 	  = fold_build2 (PLUS_EXPR, type,
2034 			 fold_build2 (RSHIFT_EXPR, type,
2035 				      fold_build2 (MINUS_EXPR, type,
2036 						   ni_minus_gap,
2037 						   build_int_cst (type, vf)),
2038 				      log_vf),
2039 			 build_int_cst (type, 1));
2040       step_vector = build_one_cst (type);
2041     }
2042   else
2043     {
2044       niters_vector = ni_minus_gap;
2045       step_vector = build_int_cst (type, vf);
2046     }
2047 
2048   if (!is_gimple_val (niters_vector))
2049     {
2050       var = create_tmp_var (type, "bnd");
2051       gimple_seq stmts = NULL;
2052       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2053       gsi_insert_seq_on_edge_immediate (pe, stmts);
2054       /* The peeling algorithm guarantees that the vector loop bound is at least
2055 	 ONE, so we set range information to make the niters analyzer's life easier.
2056 	 Note that the number of latch iterations can be TYPE_MAX_VALUE, so we have
2057 	 to represent the vector niters as (TYPE_MAX_VALUE + 1) >> log_vf.  */
2058       if (stmts != NULL && log_vf)
2059 	{
2060 	  if (niters_no_overflow)
2061 	    set_range_info (niters_vector, VR_RANGE,
2062 			    wi::one (TYPE_PRECISION (type)),
2063 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
2064 						       TYPE_SIGN (type)),
2065 					exact_log2 (const_vf),
2066 					TYPE_SIGN (type)));
2067 	  /* For VF == 1 the vector IV might also overflow so we cannot
2068 	     assert a minimum value of 1.  */
2069 	  else if (const_vf > 1)
2070 	    set_range_info (niters_vector, VR_RANGE,
2071 			    wi::one (TYPE_PRECISION (type)),
2072 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
2073 						       TYPE_SIGN (type))
2074 					- (const_vf - 1),
2075 					exact_log2 (const_vf), TYPE_SIGN (type))
2076 			    + 1);
2077 	}
2078     }
2079   *niters_vector_ptr = niters_vector;
2080   *step_vector_ptr = step_vector;
2081 
2082   return;
2083 }
2084 
2085 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2086    loop specified by LOOP_VINFO after vectorization, compute the number
2087    of iterations before vectorization (niters_vector * vf) and store it
2088    to NITERS_VECTOR_MULT_VF_PTR.  */
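/* E.g. (illustrative): for niters_vector = 4 and VF = 4 the result is
   4 << 2 = 16 scalar iterations handled by the vector loop.  */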
2089 
2090 static void
2091 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2092 				     tree niters_vector,
2093 				     tree *niters_vector_mult_vf_ptr)
2094 {
2095   /* We should be using a step_vector of VF if VF is variable.  */
2096   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2097   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2098   tree type = TREE_TYPE (niters_vector);
2099   tree log_vf = build_int_cst (type, exact_log2 (vf));
2100   basic_block exit_bb = single_exit (loop)->dest;
2101 
2102   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2103   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2104 					    niters_vector, log_vf);
2105   if (!is_gimple_val (niters_vector_mult_vf))
2106     {
2107       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2108       gimple_seq stmts = NULL;
2109       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2110 						    &stmts, true, var);
2111       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2112       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2113     }
2114   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2115 }
2116 
2117 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2118    this function searches for the corresponding lcssa phi node in exit
2119    bb of LOOP.  If it is found, return the phi result; otherwise return
2120    NULL.  */
2121 
2122 static tree
2123 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2124 		gphi *lcssa_phi)
2125 {
2126   gphi_iterator gsi;
2127   edge e = single_exit (loop);
2128 
2129   gcc_assert (single_pred_p (e->dest));
2130   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2131     {
2132       gphi *phi = gsi.phi ();
2133       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2134 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2135 	return PHI_RESULT (phi);
2136     }
2137   return NULL_TREE;
2138 }
2139 
2140 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2141    from SECOND/FIRST and puts it at the original loop's preheader/exit
2142    edge; the two loops are arranged as below:
2143 
2144        preheader_a:
2145      first_loop:
2146        header_a:
2147 	 i_1 = PHI<i_0, i_2>;
2148 	 ...
2149 	 i_2 = i_1 + 1;
2150 	 if (cond_a)
2151 	   goto latch_a;
2152 	 else
2153 	   goto between_bb;
2154        latch_a:
2155 	 goto header_a;
2156 
2157        between_bb:
2158 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2159 
2160      second_loop:
2161        header_b:
2162 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2163 				 or with i_2 if no LCSSA phi is created
2164 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2165 	 ...
2166 	 i_4 = i_3 + 1;
2167 	 if (cond_b)
2168 	   goto latch_b;
2169 	 else
2170 	   goto exit_bb;
2171        latch_b:
2172 	 goto header_b;
2173 
2174        exit_bb:
2175 
2176    This function creates loop-closed SSA for the first loop and updates the
2177    second loop's PHI nodes by replacing the argument on its incoming edge with
2178    the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2179    is false, loop-closed SSA phis will only be created for non-IV phis of
2180    the first loop.
2181 
2182    This function assumes the exit bb of the first loop is the preheader bb of
2183    the second loop, i.e., between_bb in the example code.  With PHIs updated,
2184    the second loop will execute the remaining iterations of the first.  */
2185 
2186 static void
2187 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2188 				   class loop *first, class loop *second,
2189 				   bool create_lcssa_for_iv_phis)
2190 {
2191   gphi_iterator gsi_update, gsi_orig;
2192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2193 
2194   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2195   edge second_preheader_e = loop_preheader_edge (second);
2196   basic_block between_bb = single_exit (first)->dest;
2197 
2198   gcc_assert (between_bb == second_preheader_e->src);
2199   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2200   /* Either the first loop or the second is the loop to be vectorized.  */
2201   gcc_assert (loop == first || loop == second);
2202 
2203   for (gsi_orig = gsi_start_phis (first->header),
2204        gsi_update = gsi_start_phis (second->header);
2205        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2206        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2207     {
2208       gphi *orig_phi = gsi_orig.phi ();
2209       gphi *update_phi = gsi_update.phi ();
2210 
2211       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2212       /* Generate lcssa PHI node for the first loop.  */
2213       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2214       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2215       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2216 	{
2217 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2218 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2219 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2220 	  arg = new_res;
2221 	}
2222 
2223       /* Update PHI node in the second loop by replacing arg on the loop's
2224 	 incoming edge.  */
2225       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2226     }
2227 
2228   /* For epilogue peeling we have to make sure to copy all LC PHIs
2229      for correct vectorization of live stmts.  */
2230   if (loop == first)
2231     {
2232       basic_block orig_exit = single_exit (second)->dest;
2233       for (gsi_orig = gsi_start_phis (orig_exit);
2234 	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2235 	{
2236 	  gphi *orig_phi = gsi_orig.phi ();
2237 	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2238 	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2239 	    continue;
2240 
2241 	  /* Already created in the above loop.   */
2242 	  if (find_guard_arg (first, second, orig_phi))
2243 	    continue;
2244 
2245 	  tree new_res = copy_ssa_name (orig_arg);
2246 	  gphi *lcphi = create_phi_node (new_res, between_bb);
2247 	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2248 	}
2249     }
2250 }
2251 
2252 /* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2253    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2254    are the two pred edges of the merge point before UPDATE_LOOP.  The two loops
2255    appear like below:
2256 
2257        guard_bb:
2258 	 if (cond)
2259 	   goto merge_bb;
2260 	 else
2261 	   goto skip_loop;
2262 
2263      skip_loop:
2264        header_a:
2265 	 i_1 = PHI<i_0, i_2>;
2266 	 ...
2267 	 i_2 = i_1 + 1;
2268 	 if (cond_a)
2269 	   goto latch_a;
2270 	 else
2271 	   goto exit_a;
2272        latch_a:
2273 	 goto header_a;
2274 
2275        exit_a:
2276 	 i_5 = PHI<i_2>;
2277 
2278        merge_bb:
2279 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2280 
2281      update_loop:
2282        header_b:
2283 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2284 	 ...
2285 	 i_4 = i_3 + 1;
2286 	 if (cond_b)
2287 	   goto latch_b;
2288 	 else
2289 	   goto exit_bb;
2290        latch_b:
2291 	 goto header_b;
2292 
2293        exit_bb:
2294 
2295    This function creates PHI nodes at merge_bb and replaces the use of i_5
2296    in update_loop's PHI node with the result of the new PHI node.  */
2297 
2298 static void
2299 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2300 				    class loop *update_loop,
2301 				    edge guard_edge, edge merge_edge)
2302 {
2303   location_t merge_loc, guard_loc;
2304   edge orig_e = loop_preheader_edge (skip_loop);
2305   edge update_e = loop_preheader_edge (update_loop);
2306   gphi_iterator gsi_orig, gsi_update;
2307 
2308   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2309 	gsi_update = gsi_start_phis (update_loop->header));
2310        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2311        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2312     {
2313       gphi *orig_phi = gsi_orig.phi ();
2314       gphi *update_phi = gsi_update.phi ();
2315 
2316       /* Generate new phi node at merge bb of the guard.  */
2317       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2318       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2319 
2320       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2321 	 args in NEW_PHI for these edges.  */
2322       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2323       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2324       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2325       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2326       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2327       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2328 
2329       /* Update phi in UPDATE_PHI.  */
2330       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2331     }
2332 }
2333 
2334 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2335    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2336    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2337    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2338    The CFG looks like:
2339 
2340      loop:
2341        header_a:
2342 	 i_1 = PHI<i_0, i_2>;
2343 	 ...
2344 	 i_2 = i_1 + 1;
2345 	 if (cond_a)
2346 	   goto latch_a;
2347 	 else
2348 	   goto exit_a;
2349        latch_a:
2350 	 goto header_a;
2351 
2352        exit_a:
2353 
2354        guard_bb:
2355 	 if (cond)
2356 	   goto merge_bb;
2357 	 else
2358 	   goto epilog_loop;
2359 
2360        ;; fall_through_bb
2361 
2362      epilog_loop:
2363        header_b:
2364 	 i_3 = PHI<i_2, i_4>;
2365 	 ...
2366 	 i_4 = i_3 + 1;
2367 	 if (cond_b)
2368 	   goto latch_b;
2369 	 else
2370 	   goto merge_bb;
2371        latch_b:
2372 	 goto header_b;
2373 
2374        merge_bb:
2375 	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2376 
2377        exit_bb:
2378 	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2379 
2380    For each name used outside EPILOG (i.e., for each name that has an lcssa
2381    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2382    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2383    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2384    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2385    in exit_bb will also be updated.  */
2386 
2387 static void
2388 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2389 				    edge guard_edge, edge merge_edge)
2390 {
2391   gphi_iterator gsi;
2392   basic_block merge_bb = guard_edge->dest;
2393 
2394   gcc_assert (single_succ_p (merge_bb));
2395   edge e = single_succ_edge (merge_bb);
2396   basic_block exit_bb = e->dest;
2397   gcc_assert (single_pred_p (exit_bb));
2398   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2399 
2400   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2401     {
2402       gphi *update_phi = gsi.phi ();
2403       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2404 
2405       tree merge_arg = NULL_TREE;
2406 
2407       /* If the old argument is a SSA_NAME use its current_def.  */
2408       if (TREE_CODE (old_arg) == SSA_NAME)
2409 	merge_arg = get_current_def (old_arg);
2410       /* If it's a constant or doesn't have a current_def, just use the old
2411 	 argument.  */
2412       if (!merge_arg)
2413 	merge_arg = old_arg;
2414 
2415       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2416       /* If the var is live after loop but not a reduction, we simply
2417 	 use the old arg.  */
2418       if (!guard_arg)
2419 	guard_arg = old_arg;
2420 
2421       /* Create new phi node in MERGE_BB:  */
2422       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2423       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2424 
2425       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2426 	 the two PHI args in merge_phi for these edges.  */
2427       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2428       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2429 
2430       /* Update the original phi in exit_bb.  */
2431       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2432     }
2433 }
2434 
2435 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2436    the args of its loop-closed SSA PHIs need to be updated.  */
2437 
2438 static void
2439 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2440 {
2441   gphi_iterator gsi;
2442   basic_block exit_bb = single_exit (epilog)->dest;
2443 
2444   gcc_assert (single_pred_p (exit_bb));
2445   edge e = EDGE_PRED (exit_bb, 0);
2446   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2447     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2448 }
2449 
2450 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2451    iterate exactly CONST_NITERS times.  Make a final decision about
2452    whether the epilogue loop should be used, returning true if so.  */
2453 
2454 static bool
2455 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2456 			     unsigned HOST_WIDE_INT const_niters)
2457 {
2458   /* Avoid wrap-around when computing const_niters - 1.  Also reject
2459      using an epilogue loop for a single scalar iteration, even if
2460      we could in principle implement that using partial vectors.  */
2461   unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2462   if (const_niters <= gap_niters + 1)
2463     return false;
2464 
2465   /* Install the number of iterations.  */
2466   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2467   tree niters_tree = build_int_cst (niters_type, const_niters);
2468   tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2469 
2470   LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2471   LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2472 
2473   /* Decide what to do if the number of epilogue iterations is not
2474      a multiple of the epilogue loop's vectorization factor.  */
2475   return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2476 }
2477 
2478 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2479    Return a value that equals:
2480 
2481    - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2482    - SKIP_VALUE when the main loop is skipped.  */
2483 
2484 tree
2485 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2486 			   tree skip_value)
2487 {
2488   gcc_assert (loop_vinfo->main_loop_edge);
2489 
2490   tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2491   basic_block bb = loop_vinfo->main_loop_edge->dest;
2492   gphi *new_phi = create_phi_node (phi_result, bb);
2493   add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2494 	       UNKNOWN_LOCATION);
2495   add_phi_arg (new_phi, skip_value,
2496 	       loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2497   return phi_result;
2498 }
2499 
2500 /* Function vect_do_peeling.
2501 
2502    Input:
2503    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2504 
2505        preheader:
2506      LOOP:
2507        header_bb:
2508 	 loop_body
2509 	 if (exit_loop_cond) goto exit_bb
2510 	 else                goto header_bb
2511        exit_bb:
2512 
2513    - NITERS: The number of iterations of the loop.
2514    - NITERSM1: The number of iterations of the loop's latch.
2515    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2516    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2517 			      CHECK_PROFITABILITY is true.
2518    Output:
2519    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2520      iterate after vectorization; see vect_set_loop_condition for details.
2521    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2522      should be set to the number of scalar iterations handled by the
2523      vector loop.  The SSA name is only used on exit from the loop.
2524 
2525    This function peels prolog and epilog from the loop, adds guards skipping
2526    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2527    would look like:
2528 
2529        guard_bb_1:
2530 	 if (prefer_scalar_loop) goto merge_bb_1
2531 	 else                    goto guard_bb_2
2532 
2533        guard_bb_2:
2534          if (skip_prolog) goto merge_bb_2
2535          else             goto prolog_preheader
2536 
2537        prolog_preheader:
2538      PROLOG:
2539        prolog_header_bb:
2540 	 prolog_body
2541 	 if (exit_prolog_cond) goto prolog_exit_bb
2542 	 else                  goto prolog_header_bb
2543        prolog_exit_bb:
2544 
2545        merge_bb_2:
2546 
2547        vector_preheader:
2548      VECTOR LOOP:
2549        vector_header_bb:
2550 	 vector_body
2551 	 if (exit_vector_cond) goto vector_exit_bb
2552 	 else                  goto vector_header_bb
2553        vector_exit_bb:
2554 
2555        guard_bb_3:
2556 	 if (skip_epilog) goto merge_bb_3
2557 	 else             goto epilog_preheader
2558 
2559        merge_bb_1:
2560 
2561        epilog_preheader:
2562      EPILOG:
2563        epilog_header_bb:
2564 	 epilog_body
2565 	 if (exit_epilog_cond) goto merge_bb_3
2566 	 else                  goto epilog_header_bb
2567 
2568        merge_bb_3:
2569 
2570    Note that this function peels the prolog and epilog, and adds the guards,
2571    only when necessary.
2572    This function returns the epilogue loop if a decision was made to vectorize
2573    it, otherwise NULL.
2574 
2575    The analysis resulting in this epilogue loop's loop_vec_info was performed
2576    in the same vect_analyze_loop call as the main loop's.  At that time
2577    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2578    vectorization factors than the main loop.  This list is stored in the main
2579    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2580    vectorize the epilogue loop for a lower vectorization factor, the
2581    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2582    updated and linked to the epilogue loop.  This is later used to vectorize
2583    the epilogue.  The reason the loop_vec_info needs updating is that it was
2584    constructed based on the original main loop, and the epilogue loop is a
2585    copy of this loop, so all links pointing to statements in the original loop
2586    need updating.  Furthermore, these loop_vec_infos share the
2587    data_reference records, which will also need to be updated.
2588 
2589    TODO: Guard for prefer_scalar_loop should be emitted along with
2590    versioning conditions if loop versioning is needed.  */
2591 
2592 
2593 class loop *
2594 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2595 		 tree *niters_vector, tree *step_vector,
2596 		 tree *niters_vector_mult_vf_var, int th,
2597 		 bool check_profitability, bool niters_no_overflow,
2598 		 tree *advance)
2599 {
2600   edge e, guard_e;
2601   tree type = TREE_TYPE (niters), guard_cond;
2602   basic_block guard_bb, guard_to;
2603   profile_probability prob_prolog, prob_vector, prob_epilog;
2604   int estimated_vf;
2605   int prolog_peeling = 0;
2606   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2607   bool vect_epilogues_updated_niters = false;
2608   /* We currently do not support prolog peeling if the target alignment is not
2609      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2610      target alignment being constant.  */
2611   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2612   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2613     return NULL;
2614 
2615   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2616     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2617 
2618   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2619   poly_uint64 bound_epilog = 0;
2620   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2621       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2622     bound_epilog += vf - 1;
2623   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2624     bound_epilog += 1;
2625   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2626   poly_uint64 bound_scalar = bound_epilog;
2627 
2628   if (!prolog_peeling && !epilog_peeling)
2629     return NULL;
2630 
2631   /* Before doing any peeling make sure to reset debug binds outside of
2632      the loop referring to defs not in LC SSA.  */
2633   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2634   for (unsigned i = 0; i < loop->num_nodes; ++i)
2635     {
2636       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2637       imm_use_iterator ui;
2638       gimple *use_stmt;
2639       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2640 	   gsi_next (&gsi))
2641 	{
2642 	  FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2643 	    if (gimple_debug_bind_p (use_stmt)
2644 		&& loop != gimple_bb (use_stmt)->loop_father
2645 		&& !flow_loop_nested_p (loop,
2646 					gimple_bb (use_stmt)->loop_father))
2647 	      {
2648 		gimple_debug_bind_reset_value (use_stmt);
2649 		update_stmt (use_stmt);
2650 	      }
2651 	}
2652       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2653 	   gsi_next (&gsi))
2654 	{
2655 	  ssa_op_iter op_iter;
2656 	  def_operand_p def_p;
2657 	  FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2658 	    FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2659 	      if (gimple_debug_bind_p (use_stmt)
2660 		  && loop != gimple_bb (use_stmt)->loop_father
2661 		  && !flow_loop_nested_p (loop,
2662 					  gimple_bb (use_stmt)->loop_father))
2663 		{
2664 		  gimple_debug_bind_reset_value (use_stmt);
2665 		  update_stmt (use_stmt);
2666 		}
2667 	}
2668     }
2669 
2670   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2671   estimated_vf = vect_vf_for_cost (loop_vinfo);
2672   if (estimated_vf == 2)
2673     estimated_vf = 3;
2674   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2675 			.apply_scale (estimated_vf - 1, estimated_vf);
2676 
2677   class loop *prolog, *epilog = NULL;
2678   class loop *first_loop = loop;
2679   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2680 
2681   /* We might have a queued need to update virtual SSA form.  As we
2682      delete the update SSA machinery below after doing a regular
2683      incremental SSA update during loop copying, make sure we don't
2684      lose that fact.
2685      ???  Needing to update virtual SSA form by renaming is unfortunate
2686      but not all of the vectorizer code inserting new loads / stores
2687      properly assigns virtual operands to those statements.  */
2688   update_ssa (TODO_update_ssa_only_virtuals);
2689 
2690   create_lcssa_for_virtual_phi (loop);
2691 
2692   /* If we're vectorizing an epilogue loop, the update_ssa above will
2693      have ensured that the virtual operand is in SSA form throughout the
2694      vectorized main loop.  Normally it is possible to trace the updated
2695      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2696      back to scalar-stmt vuses, meaning that the effect of the SSA update
2697      remains local to the main loop.  However, there are rare cases in
2698      which the vectorized loop has vdefs even when the original scalar
2699      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2700      introduces clobbers of the temporary vector array, which in turn
2701      needs new vdefs.  If the scalar loop doesn't write to memory, these
2702      new vdefs will be the only ones in the vector loop.
2703 
2704      In that case, update_ssa will have added a new virtual phi to the
2705      main loop, which previously didn't need one.  Ensure that we (locally)
2706      maintain LCSSA form for the virtual operand, just as we would have
2707      done if the virtual phi had existed from the outset.  This makes it
2708      easier to duplicate the scalar epilogue loop below.  */
2709   tree vop_to_rename = NULL_TREE;
2710   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2711     {
2712       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2713       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2714     }
2715 
2716   if (MAY_HAVE_DEBUG_BIND_STMTS)
2717     {
2718       gcc_assert (!adjust_vec.exists ());
2719       adjust_vec.create (32);
2720     }
2721   initialize_original_copy_tables ();
2722 
2723   /* Record the anchor bb at which the guard should be placed if the scalar
2724      loop might be preferred.  */
2725   basic_block anchor = loop_preheader_edge (loop)->src;
2726 
2727   /* Generate the number of iterations for the prolog loop.  We do this here
2728      so that we can also get the upper bound on the number of iterations.  */
2729   tree niters_prolog;
2730   int bound_prolog = 0;
2731   if (prolog_peeling)
2732     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2733 						  &bound_prolog);
2734   else
2735     niters_prolog = build_int_cst (type, 0);
2736 
2737   loop_vec_info epilogue_vinfo = NULL;
2738   if (vect_epilogues)
2739     {
2740       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2741       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2742     }
2743 
2744   tree niters_vector_mult_vf = NULL_TREE;
2745   /* Save NITERS before the loop, as it may be changed by the prologue.  */
2746   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2747   edge update_e = NULL, skip_e = NULL;
2748   unsigned int lowest_vf = constant_lower_bound (vf);
2749   /* If we know the number of scalar iterations for the main loop we should
2750      check whether after the main loop there are enough iterations left over
2751      for the epilogue.  */
2752   if (vect_epilogues
2753       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2754       && prolog_peeling >= 0
2755       && known_eq (vf, lowest_vf))
2756     {
2757       unsigned HOST_WIDE_INT eiters
2758 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
2759 	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2760 
2761       eiters -= prolog_peeling;
2762       eiters
2763 	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2764 
2765       while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2766 	{
2767 	  delete epilogue_vinfo;
2768 	  epilogue_vinfo = NULL;
2769 	  if (loop_vinfo->epilogue_vinfos.length () == 0)
2770 	    {
2771 	      vect_epilogues = false;
2772 	      break;
2773 	    }
2774 	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2775 	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
2776 	}
2777       vect_epilogues_updated_niters = true;
2778     }
2779   /* Prolog loop may be skipped.  */
2780   bool skip_prolog = (prolog_peeling != 0);
2781   /* Skip the vectorized loop and go directly to the epilog when there are not
2782      enough iterations to enter the vectorized loop.  If we know the number of
2783      scalar iterations, we add such a runtime check only when that number may be
2784      smaller than the number of iterations required to enter the vector loop;
2785      for this we use the upper bounds on the prolog and epilog peeling.  When we
2786      don't know the number of iterations and don't require versioning, no skip
2787      is needed, because we have already asserted that there are enough scalar
2788      iterations to enter the main loop.  When we are versioning, we only add
2789      such a skip if we have chosen to vectorize the epilogue.  */
2793   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2794 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2795 				  bound_prolog + bound_epilog)
2796 		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2797 			 || vect_epilogues));
2798   /* Epilog loop must be executed if the number of iterations for epilog
2799      loop is known at compile time, otherwise we need to add a check at
2800      the end of vector loop and skip to the end of epilog loop.  */
2801   bool skip_epilog = (prolog_peeling < 0
2802 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2803 		      || !vf.is_constant ());
2804   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2805   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2806     skip_epilog = false;
2807 
2808   if (skip_vector)
2809     {
2810       split_edge (loop_preheader_edge (loop));
2811 
2812       /* Due to the order in which we peel prolog and epilog, we first
2813 	 propagate probability to the whole loop.  The purpose is to
2814 	 avoid adjusting probabilities of both prolog and vector loops
2815 	 separately.  Note in this case, the probability of epilog loop
2816 	 needs to be scaled back later.  */
2817       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2818       if (prob_vector.initialized_p ())
2819 	{
2820 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2821 	  scale_loop_profile (loop, prob_vector, 0);
2822 	}
2823     }
2824 
2825   dump_user_location_t loop_loc = find_loop_location (loop);
2826   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2827   if (vect_epilogues)
2828     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2829        use the original scalar loop as remaining epilogue if necessary.  */
2830     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2831       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2832 
2833   if (prolog_peeling)
2834     {
2835       e = loop_preheader_edge (loop);
2836       if (!slpeel_can_duplicate_loop_p (loop, e))
2837 	{
2838 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2839 			   "loop can't be duplicated to preheader edge.\n");
2840 	  gcc_unreachable ();
2841 	}
2842       /* Peel prolog and put it on preheader edge of loop.  */
2843       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2844       if (!prolog)
2845 	{
2846 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2847 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2848 	  gcc_unreachable ();
2849 	}
2850       prolog->force_vectorize = false;
2851       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2852       first_loop = prolog;
2853       reset_original_copy_tables ();
2854 
2855       /* Update the number of iterations for prolog loop.  */
2856       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2857       vect_set_loop_condition (prolog, NULL, niters_prolog,
2858 			       step_prolog, NULL_TREE, false);
2859 
2860       /* Skip the prolog loop.  */
2861       if (skip_prolog)
2862 	{
2863 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2864 				    niters_prolog, build_int_cst (type, 0));
2865 	  guard_bb = loop_preheader_edge (prolog)->src;
2866 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2867 	  guard_to = split_edge (loop_preheader_edge (loop));
2868 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2869 					   guard_to, guard_bb,
2870 					   prob_prolog.invert (),
2871 					   irred_flag);
2872 	  e = EDGE_PRED (guard_to, 0);
2873 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2874 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2875 
2876 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2877 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2878 	}
2879 
2880       /* Update init address of DRs.  */
2881       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2882       /* Update niters for vector loop.  */
2883       LOOP_VINFO_NITERS (loop_vinfo)
2884 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2885       LOOP_VINFO_NITERSM1 (loop_vinfo)
2886 	= fold_build2 (MINUS_EXPR, type,
2887 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2888       bool new_var_p = false;
2889       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2890       /* It's guaranteed that vector loop bound before vectorization is at
2891 	 least VF, so set range information for newly generated var.  */
2892       if (new_var_p)
2893 	set_range_info (niters, VR_RANGE,
2894 			wi::to_wide (build_int_cst (type, vf)),
2895 			wi::to_wide (TYPE_MAX_VALUE (type)));
2896 
2897       /* Prolog iterates at most bound_prolog times, latch iterates at
2898 	 most bound_prolog - 1 times.  */
2899       record_niter_bound (prolog, bound_prolog - 1, false, true);
2900       delete_update_ssa ();
2901       adjust_vec_debug_stmts ();
2902       scev_reset ();
2903     }
2904 
2905   if (epilog_peeling)
2906     {
2907       e = single_exit (loop);
2908       if (!slpeel_can_duplicate_loop_p (loop, e))
2909 	{
2910 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2911 			   "loop can't be duplicated to exit edge.\n");
2912 	  gcc_unreachable ();
2913 	}
2914       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2915 	 said epilog then we should use a copy of the main loop as a starting
2916 	 point.  This loop may have already had some preliminary transformations
2917 	 to allow for more optimal vectorization, for example if-conversion.
2918 	 If we are not vectorizing the epilog then we should use the scalar loop
2919 	 as the transformations mentioned above make less or no sense when not
2920 	 vectorizing.  */
2921       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2922       if (vop_to_rename)
2923 	{
2924 	  /* Vectorizing the main loop can sometimes introduce a vdef to
2925 	     a loop that previously didn't have one; see the comment above
2926 	     the definition of VOP_TO_RENAME for details.  The definition
2927 	     D that holds on E will then be different from the definition
2928 	     VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2929 	     rename VOP_TO_RENAME to D when copying the loop.
2930 
2931 	     The virtual operand is in LCSSA form for the main loop,
2932 	     and no stmt between the main loop and E needs a vdef,
2933 	     so we know that D is provided by a phi rather than by a
2934 	     vdef on a normal gimple stmt.  */
2935 	  basic_block vdef_bb = e->src;
2936 	  gphi *vphi;
2937 	  while (!(vphi = get_virtual_phi (vdef_bb)))
2938 	    vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2939 	  gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2940 	  set_current_def (vop_to_rename, gimple_phi_result (vphi));
2941 	}
2942       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2943       if (!epilog)
2944 	{
2945 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2946 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2947 	  gcc_unreachable ();
2948 	}
2949       epilog->force_vectorize = false;
2950       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2951 
2952       /* The scalar version of the loop may be preferred.  In this case, add
2953 	 a guard and skip to the epilog.  Note this only happens when the
2954 	 number of iterations of the loop is unknown at compile time, otherwise
2955 	 this won't be vectorized.  */
2956       if (skip_vector)
2957 	{
2958 	  /* An additional epilogue iteration is peeled if a gap exists.  */
2959 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2960 						bound_prolog, bound_epilog,
2961 						th, &bound_scalar,
2962 						check_profitability);
2963 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2964 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
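	  /* For example (illustration only): with a 32-bit niters type, a
	     scalar loop executing 1<<32 times has NITERS wrap around to 0
	     while NITERSM1 is still representable as 0xffffffff, so the
	     LT_EXPR on NITERSM1 is not fooled by the wrap.  */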
2965 	  guard_bb = anchor;
2966 	  guard_to = split_edge (loop_preheader_edge (epilog));
2967 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2968 					   guard_to, guard_bb,
2969 					   prob_vector.invert (),
2970 					   irred_flag);
2971 	  skip_e = guard_e;
2972 	  e = EDGE_PRED (guard_to, 0);
2973 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2974 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2975 
2976 	  /* Simply propagate profile info from guard_bb to guard_to which is
2977 	     a merge point of control flow.  */
2978 	  guard_to->count = guard_bb->count;
2979 
2980 	  /* Scale probability of epilog loop back.
2981 	     FIXME: We should avoid scaling down and back up.  Profile may
2982 	     get lost if we scale down to 0.  */
2983 	  basic_block *bbs = get_loop_body (epilog);
2984 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2985 	    bbs[i]->count = bbs[i]->count.apply_scale
2986 				 (bbs[i]->count,
2987 				  bbs[i]->count.apply_probability
2988 				    (prob_vector));
2989 	  free (bbs);
2990 	}
2991 
2992       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2993       /* If the loop is peeled a non-zero constant number of times, NITERS now
2994 	 refers to orig_niters - prolog_peeling; it won't overflow even if
2995 	 orig_niters overflows.  */
2996       niters_no_overflow |= (prolog_peeling > 0);
2997       vect_gen_vector_loop_niters (loop_vinfo, niters,
2998 				   niters_vector, step_vector,
2999 				   niters_no_overflow);
3000       if (!integer_onep (*step_vector))
3001 	{
3002 	  /* On exit from the loop we will have an easy way of calculating
3003 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
3004 	     until then.  */
3005 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3006 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3007 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
3008 	}
3009       else
3010 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3011 					     &niters_vector_mult_vf);
3012       /* Update IVs of original loop as if they were advanced by
3013 	 niters_vector_mult_vf steps.  */
3014       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3015       update_e = skip_vector ? e : loop_preheader_edge (epilog);
3016       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3017 					update_e);
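      /* For instance (a sketch, not the exact code generated): a scalar IV
	 that starts at 0 with step 1 gets the value NITERS_VECTOR_MULT_VF on
	 UPDATE_E, so the epilog resumes exactly where the vector loop
	 stopped.  */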
3018 
3019       if (skip_epilog)
3020 	{
3021 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3022 				    niters, niters_vector_mult_vf);
3023 	  guard_bb = single_exit (loop)->dest;
3024 	  guard_to = split_edge (single_exit (epilog));
3025 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3026 					   skip_vector ? anchor : guard_bb,
3027 					   prob_epilog.invert (),
3028 					   irred_flag);
3029 	  if (vect_epilogues)
3030 	    epilogue_vinfo->skip_this_loop_edge = guard_e;
3031 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3032 					      single_exit (epilog));
3033 	  /* Only need to handle basic block before epilog loop if it's not
3034 	     the guard_bb, which is the case when skip_vector is true.  */
3035 	  if (guard_bb != bb_before_epilog)
3036 	    {
3037 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3038 
3039 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3040 	    }
3041 	  scale_loop_profile (epilog, prob_epilog, 0);
3042 	}
3043       else
3044 	slpeel_update_phi_nodes_for_lcssa (epilog);
3045 
3046       unsigned HOST_WIDE_INT bound;
3047       if (bound_scalar.is_constant (&bound))
3048 	{
3049 	  gcc_assert (bound != 0);
3050 	  /* -1 to convert loop iterations to latch iterations.  */
3051 	  record_niter_bound (epilog, bound - 1, false, true);
3052 	}
3053 
3054       delete_update_ssa ();
3055       adjust_vec_debug_stmts ();
3056       scev_reset ();
3057     }
3058 
3059   if (vect_epilogues)
3060     {
3061       epilog->aux = epilogue_vinfo;
3062       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3063 
3064       loop_constraint_clear (epilog, LOOP_C_INFINITE);
3065 
3066       /* We must now calculate NITERS, the number of iterations performed by
3067 	 the previous loop, and EPILOGUE_NITERS to be performed by the epilogue.  */
3068       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3069 				 niters_prolog, niters_vector_mult_vf);
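      /* Worked example (illustration only): with 103 scalar iterations,
	 prolog_peeling == 3 and VF == 8, the vector loop handles
	 (103 - 3) / 8 * 8 == 96 iterations, so NITERS here becomes
	 3 + 96 == 99 and the epilogue below is left with 103 - 99 == 4
	 iterations.  */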
3070 
3071       /* If skip_vector we may have skipped the previous vectorized loop;
3072 	 insert a phi-node to determine whether we came from that loop
3073 	 (via the update_e edge) or from the skip-vector basic block
3074 	 (via the skip_e edge).  */
3075       if (skip_vector)
3076 	{
3077 	  gcc_assert (update_e != NULL
3078 		      && skip_e != NULL
3079 		      && !vect_epilogues_updated_niters);
3080 	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3081 					   update_e->dest);
3082 	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3083 	  gimple *stmt = gimple_build_assign (new_ssa, niters);
3084 	  gimple_stmt_iterator gsi;
3085 	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3086 	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3087 	    {
3088 	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3089 	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3090 	    }
3091 	  else
3092 	    {
3093 	      gsi = gsi_last_bb (update_e->src);
3094 	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3095 	    }
3096 
3097 	  niters = new_ssa;
3098 	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3099 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3100 		       UNKNOWN_LOCATION);
3101 	  niters = PHI_RESULT (new_phi);
3102 	  epilogue_vinfo->main_loop_edge = update_e;
3103 	  epilogue_vinfo->skip_main_loop_edge = skip_e;
3104 	}
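      /* The result is roughly (illustration only, NITERS_TOTAL standing for
	 the sum computed above):

	     niters = PHI <NITERS_TOTAL (update_e), 0 (skip_e)>

	 so the epilogue sees zero completed iterations when the vector loop
	 was skipped.  */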
3105 
3106       /* Set ADVANCE to the number of iterations performed by the previous
3107 	 loop and its prologue.  */
3108       *advance = niters;
3109 
3110       if (!vect_epilogues_updated_niters)
3111 	{
3112 	  /* Subtract the number of iterations performed by the vectorized loop
3113 	     from the number of total iterations.  */
3114 	  tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3115 					      before_loop_niters,
3116 					      niters);
3117 
3118 	  LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3119 	  LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3120 	    = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3121 			   epilogue_niters,
3122 			   build_one_cst (TREE_TYPE (epilogue_niters)));
3123 
3124 	  /* Decide what to do if the number of epilogue iterations is not
3125 	     a multiple of the epilogue loop's vectorization factor.
3126 	     We should have rejected the loop during the analysis phase
3127 	     if this fails.  */
3128 	  if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3129 							   true))
3130 	    gcc_unreachable ();
3131 	}
3132     }
3133 
3134   adjust_vec.release ();
3135   free_original_copy_tables ();
3136 
3137   return vect_epilogues ? epilog : NULL;
3138 }
3139 
3140 /* Function vect_create_cond_for_niters_checks.
3141 
3142    Create a conditional expression that represents the run-time checks for
3143    loop's niter.  The loop is guaranteed to terminate if the run-time
3144    checks hold.
3145 
3146    Input:
3147    COND_EXPR  - input conditional expression.  New conditions will be chained
3148 		with logical AND operation.  If it is NULL, the niters
3149 		condition is returned on its own.
3150    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3151 		under which the niter information is valid.
3152 
3153    Output:
3154    COND_EXPR - conditional expression.
3155 
3156    The returned COND_EXPR is the conditional expression to be used in the
3157    if statement that controls which version of the loop gets executed at
3158    runtime.  */
3159 
3160 static void
3161 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3162 {
3163   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3164 
3165   if (*cond_expr)
3166     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3167 			      *cond_expr, part_cond_expr);
3168   else
3169     *cond_expr = part_cond_expr;
3170 }
3171 
3172 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3173    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3174 
3175 static void
3176 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3177 {
3178   if (*cond_expr)
3179     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3180 			      *cond_expr, part_cond_expr);
3181   else
3182     *cond_expr = part_cond_expr;
3183 }
3184 
3185 /* Function vect_create_cond_for_align_checks.
3186 
3187    Create a conditional expression that represents the alignment checks for
3188    all of data references (array element references) whose alignment must be
3189    checked at runtime.
3190 
3191    Input:
3192    COND_EXPR  - input conditional expression.  New conditions will be chained
3193                 with logical AND operation.
3194    LOOP_VINFO - two fields of the loop information are used.
3195                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3196                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3197 
3198    Output:
3199    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3200                          expression.
3201    The returned value is the conditional expression to be used in the if
3202    statement that controls which version of the loop gets executed at runtime.
3203 
3204    The algorithm makes two assumptions:
3205      1) The number of bytes "n" in a vector is a power of 2.
3206      2) An address "a" is aligned if a%n is zero, and this
3207         test can be done as a&(n-1) == 0.  For example, for 16
3208         byte vectors the test is a&0xf == 0.  */
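
/* For example (illustration only), with two such data references A and B
   and 16-byte vectors, the sequence built below is roughly

     addr2int0 = (signed ptrsize) &A;
     addr2int1 = (signed ptrsize) &B;
     orptrs1   = addr2int0 | addr2int1;
     andmask   = orptrs1 & 0xf;
     <cond>    = (andmask == 0);

   where the temporary names match the ones generated in the loop below.  */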
3209 
3210 static void
3211 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3212                                    tree *cond_expr,
3213 				   gimple_seq *cond_expr_stmt_list)
3214 {
3215   const vec<stmt_vec_info> &may_misalign_stmts
3216     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3217   stmt_vec_info stmt_info;
3218   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3219   tree mask_cst;
3220   unsigned int i;
3221   tree int_ptrsize_type;
3222   char tmp_name[20];
3223   tree or_tmp_name = NULL_TREE;
3224   tree and_tmp_name;
3225   gimple *and_stmt;
3226   tree ptrsize_zero;
3227   tree part_cond_expr;
3228 
3229   /* Check that mask is one less than a power of 2, i.e., mask is
3230      all zeros followed by all ones.  */
3231   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3232 
3233   int_ptrsize_type = signed_type_for (ptr_type_node);
3234 
3235   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3236      of the first vector of the i'th data reference.  */
3237 
3238   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3239     {
3240       gimple_seq new_stmt_list = NULL;
3241       tree addr_base;
3242       tree addr_tmp_name;
3243       tree new_or_tmp_name;
3244       gimple *addr_stmt, *or_stmt;
3245       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3246       bool negative = tree_int_cst_compare
3247 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3248       tree offset = negative
3249 	? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3250 		    * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3251 	: size_zero_node;
3252 
3253       /* create: addr_tmp = (int)(address_of_first_vector) */
3254       addr_base =
3255 	vect_create_addr_base_for_vector_ref (loop_vinfo,
3256 					      stmt_info, &new_stmt_list,
3257 					      offset);
3258       if (new_stmt_list != NULL)
3259 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3260 
3261       sprintf (tmp_name, "addr2int%d", i);
3262       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3263       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3264       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3265 
3266       /* The addresses are ORed together.  */
3267 
3268       if (or_tmp_name != NULL_TREE)
3269         {
3270           /* create: or_tmp = or_tmp | addr_tmp */
3271           sprintf (tmp_name, "orptrs%d", i);
3272 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3273 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3274 					 or_tmp_name, addr_tmp_name);
3275 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3276           or_tmp_name = new_or_tmp_name;
3277         }
3278       else
3279         or_tmp_name = addr_tmp_name;
3280 
3281     } /* end for i */
3282 
3283   mask_cst = build_int_cst (int_ptrsize_type, mask);
3284 
3285   /* create: and_tmp = or_tmp & mask  */
3286   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3287 
3288   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3289 				  or_tmp_name, mask_cst);
3290   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3291 
3292   /* Make and_tmp the left operand of the conditional test against zero.
3293      If and_tmp has a nonzero bit then some address is unaligned.  */
3294   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3295   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3296 				and_tmp_name, ptrsize_zero);
3297   chain_cond_expr (cond_expr, part_cond_expr);
3298 }
3299 
3300 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3301    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3302    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3303    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3304 
3305 static void
3306 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3307 {
3308   const vec<vec_object_pair> &pairs
3309     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3310   unsigned int i;
3311   vec_object_pair *pair;
3312   FOR_EACH_VEC_ELT (pairs, i, pair)
3313     {
3314       tree addr1 = build_fold_addr_expr (pair->first);
3315       tree addr2 = build_fold_addr_expr (pair->second);
3316       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3317 					 addr1, addr2);
3318       chain_cond_expr (cond_expr, part_cond_expr);
3319     }
3320 }
3321 
3322 /* Create an expression that is true when all lower-bound conditions for
3323    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3324 
3325 static void
3326 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3327 {
3328   const vec<vec_lower_bound> &lower_bounds
3329     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3330   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3331     {
3332       tree expr = lower_bounds[i].expr;
3333       tree type = unsigned_type_for (TREE_TYPE (expr));
3334       expr = fold_convert (type, expr);
3335       poly_uint64 bound = lower_bounds[i].min_value;
3336       if (!lower_bounds[i].unsigned_p)
3337 	{
3338 	  expr = fold_build2 (PLUS_EXPR, type, expr,
3339 			      build_int_cstu (type, bound - 1));
3340 	  bound += bound - 1;
3341 	}
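      /* Illustration only: for an original bound of 4 the adjustment above
	 turns the GE test below into  (unsigned) expr + 3 >= 7,  which holds
	 for expr >= 4 and, because negative values wrap to the top of the
	 unsigned range, also for expr <= -4, but for nothing in -3 .. 3;
	 i.e. it checks |expr| >= 4.  */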
3342       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3343 					 build_int_cstu (type, bound));
3344       chain_cond_expr (cond_expr, part_cond_expr);
3345     }
3346 }
3347 
3348 /* Function vect_create_cond_for_alias_checks.
3349 
3350    Create a conditional expression that represents the run-time checks for
3351    overlapping of address ranges represented by a list of data references
3352    relations passed as input.
3353 
3354    Input:
3355    COND_EXPR  - input conditional expression.  New conditions will be chained
3356                 with logical AND operation.  If it is NULL, then the function
3357                 is used to return the number of alias checks.
3358    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3359 	        to be checked.
3360 
3361    Output:
3362    COND_EXPR - conditional expression.
3363 
3364    The returned COND_EXPR is the conditional expression to be used in the if
3365    statement that controls which version of the loop gets executed at runtime.
3366 */
3367 
3368 void
3369 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3370 {
3371   const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3372     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3373 
3374   if (comp_alias_ddrs.is_empty ())
3375     return;
3376 
3377   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3378 			       &comp_alias_ddrs, cond_expr);
3379   if (dump_enabled_p ())
3380     dump_printf_loc (MSG_NOTE, vect_location,
3381 		     "created %u versioning for alias checks.\n",
3382 		     comp_alias_ddrs.length ());
3383 }
3384 
3385 
3386 /* Function vect_loop_versioning.
3387 
3388    If the loop has data references that may or may not be aligned and/or
3389    has data reference relations whose independence was not proven, then
3390    two versions of the loop need to be generated, one which is vectorized
3391    and one which isn't.  A test is then generated to control which of the
3392    loops is executed.  The test checks for the alignment of all of the
3393    data references that may or may not be aligned.  An additional
3394    sequence of runtime tests is generated for each pair of DDRs whose
3395    independence was not proven.  The vectorized version of the loop is
3396    executed only if both the alias and alignment tests pass.
3397 
3398    The test generated to check which version of the loop is executed
3399    is modified to also check for profitability as indicated by the
3400    cost model threshold TH.
3401 
3402    The versioning precondition(s) are placed in *COND_EXPR and
3403    *COND_EXPR_STMT_LIST.  */
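
/* Illustration only: the resulting control flow is roughly

     if (<cost check> && <alignment/alias/niter checks> ...)
       <loop to be vectorized>
     else
       <scalar copy of the loop>

   and the scalar copy (NLOOP) is what this function returns.  */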
3404 
3405 class loop *
3406 vect_loop_versioning (loop_vec_info loop_vinfo,
3407 		      gimple *loop_vectorized_call)
3408 {
3409   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3410   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3411   basic_block condition_bb;
3412   gphi_iterator gsi;
3413   gimple_stmt_iterator cond_exp_gsi;
3414   basic_block merge_bb;
3415   basic_block new_exit_bb;
3416   edge new_exit_e, e;
3417   gphi *orig_phi, *new_phi;
3418   tree cond_expr = NULL_TREE;
3419   gimple_seq cond_expr_stmt_list = NULL;
3420   tree arg;
3421   profile_probability prob = profile_probability::likely ();
3422   gimple_seq gimplify_stmt_list = NULL;
3423   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3424   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3425   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3426   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3427   poly_uint64 versioning_threshold
3428     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3429   tree version_simd_if_cond
3430     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3431   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3432 
3433   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3434       && !ordered_p (th, versioning_threshold))
3435     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3436 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3437 					    th - 1));
3438   if (maybe_ne (versioning_threshold, 0U))
3439     {
3440       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3441 			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3442 					      versioning_threshold - 1));
3443       if (cond_expr)
3444 	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3445 				 expr, cond_expr);
3446       else
3447 	cond_expr = expr;
3448     }
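  /* Note SCALAR_LOOP_ITERS is NITERSM1, so the GE tests against TH - 1 and
     VERSIONING_THRESHOLD - 1 above are equivalent to requiring at least TH
     resp. VERSIONING_THRESHOLD scalar iterations.  */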
3449 
3450   tree cost_name = NULL_TREE;
3451   profile_probability prob2 = profile_probability::uninitialized ();
3452   if (cond_expr
3453       && EXPR_P (cond_expr)
3454       && (version_niter
3455 	  || version_align
3456 	  || version_alias
3457 	  || version_simd_if_cond))
3458     {
3459       cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3460 						      &cond_expr_stmt_list,
3461 						      is_gimple_val, NULL_TREE);
3462       /* Split prob () into two so that the overall probability of passing
3463 	 both the cost-model and versioning checks is the orig prob.  */
3464       prob2 = prob.split (prob);
3465     }
3466 
3467   if (version_niter)
3468     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3469 
3470   if (cond_expr)
3471     {
3472       gimple_seq tem = NULL;
3473       cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3474 					  &tem,
3475 					  is_gimple_condexpr, NULL_TREE);
3476       gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3477     }
3478 
3479   if (version_align)
3480     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3481 				       &cond_expr_stmt_list);
3482 
3483   if (version_alias)
3484     {
3485       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3486       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3487       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3488     }
3489 
3490   if (version_simd_if_cond)
3491     {
3492       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3493       if (flag_checking)
3494 	if (basic_block bb
3495 	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3496 	  gcc_assert (bb != loop->header
3497 		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3498 		      && (scalar_loop == NULL
3499 			  || (bb != scalar_loop->header
3500 			      && dominated_by_p (CDI_DOMINATORS,
3501 						 scalar_loop->header, bb))));
3502       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3503       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3504 			    version_simd_if_cond, zero);
3505       if (cond_expr)
3506         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3507 				 c, cond_expr);
3508       else
3509         cond_expr = c;
3510       if (dump_enabled_p ())
3511 	dump_printf_loc (MSG_NOTE, vect_location,
3512 			 "created versioning for simd if condition check.\n");
3513     }
3514 
3515   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3516 				      &gimplify_stmt_list,
3517 				      is_gimple_condexpr, NULL_TREE);
3518   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3519 
3520   /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3521      invariant in.  */
3522   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3523   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3524        !gsi_end_p (gsi); gsi_next (&gsi))
3525     {
3526       gimple *stmt = gsi_stmt (gsi);
3527       update_stmt (stmt);
3528       ssa_op_iter iter;
3529       use_operand_p use_p;
3530       basic_block def_bb;
3531       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3532 	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3533 	    && flow_bb_inside_loop_p (outermost, def_bb))
3534 	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3535     }
3536 
3537   /* Search for the outermost loop we can version.  Avoid versioning of
3538      non-perfect nests but allow if-conversion versioned loops inside.  */
3539   class loop *loop_to_version = loop;
3540   if (flow_loop_nested_p (outermost, loop))
3541     {
3542       if (dump_enabled_p ())
3543 	dump_printf_loc (MSG_NOTE, vect_location,
3544 			 "trying to apply versioning to outer loop %d\n",
3545 			 outermost->num);
3546       if (outermost->num == 0)
3547 	outermost = superloop_at_depth (loop, 1);
3548       /* And avoid applying versioning on non-perfect nests.  */
3549       while (loop_to_version != outermost
3550 	     && (e = single_exit (loop_outer (loop_to_version)))
3551 	     && !(e->flags & EDGE_COMPLEX)
3552 	     && (!loop_outer (loop_to_version)->inner->next
3553 		 || vect_loop_vectorized_call (loop_to_version))
3554 	     && (!loop_outer (loop_to_version)->inner->next
3555 		 || !loop_outer (loop_to_version)->inner->next->next))
3556 	loop_to_version = loop_outer (loop_to_version);
3557     }
3558 
3559   /* Apply versioning.  If there is already a scalar version created by
3560      if-conversion re-use that.  Note we cannot re-use the copy of
3561      an if-converted outer-loop when vectorizing the inner loop only.  */
3562   gcond *cond;
3563   if ((!loop_to_version->inner || loop == loop_to_version)
3564       && loop_vectorized_call)
3565     {
3566       gcc_assert (scalar_loop);
3567       condition_bb = gimple_bb (loop_vectorized_call);
3568       cond = as_a <gcond *> (last_stmt (condition_bb));
3569       gimple_cond_set_condition_from_tree (cond, cond_expr);
3570       update_stmt (cond);
3571 
3572       if (cond_expr_stmt_list)
3573 	{
3574 	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3575 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3576 				 GSI_SAME_STMT);
3577 	}
3578 
3579       /* if-conversion uses profile_probability::always () for both paths,
3580 	 reset the paths probabilities appropriately.  */
3581       edge te, fe;
3582       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3583       te->probability = prob;
3584       fe->probability = prob.invert ();
3585       /* We can scale loop counts immediately but have to postpone
3586          scaling the scalar loop because we re-use it during peeling.  */
3587       scale_loop_frequencies (loop_to_version, te->probability);
3588       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3589 
3590       nloop = scalar_loop;
3591       if (dump_enabled_p ())
3592 	dump_printf_loc (MSG_NOTE, vect_location,
3593 			 "reusing %sloop version created by if conversion\n",
3594 			 loop_to_version != loop ? "outer " : "");
3595     }
3596   else
3597     {
3598       if (loop_to_version != loop
3599 	  && dump_enabled_p ())
3600 	dump_printf_loc (MSG_NOTE, vect_location,
3601 			 "applying loop versioning to outer loop %d\n",
3602 			 loop_to_version->num);
3603 
3604       unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3605 
3606       initialize_original_copy_tables ();
3607       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3608 			    prob, prob.invert (), prob, prob.invert (), true);
3609       gcc_assert (nloop);
3610       nloop = get_loop_copy (loop);
3611 
3612       /* For cycle vectorization with SLP we rely on the PHI arguments
3613 	 appearing in the same order as the SLP node operands which for the
3614 	 loop PHI nodes means the preheader edge dest index needs to remain
3615 	 the same for the analyzed loop which also becomes the vectorized one.
3616 	 Make it so in case the state after versioning differs by redirecting
3617 	 the first edge into the header to the same destination which moves
3618 	 it last.  */
3619       if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3620 	{
3621 	  edge e = EDGE_PRED (loop->header, 0);
3622 	  ssa_redirect_edge (e, e->dest);
3623 	  flush_pending_stmts (e);
3624 	}
3625       gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3626 
3627       /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3628          reap those otherwise, and they also refer to the original
3629 	 loops.  */
3630       class loop *l = loop;
3631       while (gimple *call = vect_loop_vectorized_call (l))
3632 	{
3633 	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3634 	  fold_loop_internal_call (call, boolean_false_node);
3635 	  l = loop_outer (l);
3636 	}
3637       free_original_copy_tables ();
3638 
3639       if (cond_expr_stmt_list)
3640 	{
3641 	  cond_exp_gsi = gsi_last_bb (condition_bb);
3642 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3643 				 GSI_SAME_STMT);
3644 	}
3645 
3646       /* Loop versioning violates an assumption we try to maintain during
3647 	 vectorization - that the loop exit block has a single predecessor.
3648 	 After versioning, the exit block of both loop versions is the same
3649 	 basic block (i.e. it has two predecessors). Just in order to simplify
3650 	 following transformations in the vectorizer, we fix this situation
3651 	 here by adding a new (empty) block on the exit-edge of the loop,
3652 	 with the proper loop-exit phis to maintain loop-closed-form.
3653 	 If loop versioning wasn't done from loop, but scalar_loop instead,
3654 	 merge_bb will already have just a single predecessor.  */
3655 
3656       merge_bb = single_exit (loop_to_version)->dest;
3657       if (EDGE_COUNT (merge_bb->preds) >= 2)
3658 	{
3659 	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3660 	  new_exit_bb = split_edge (single_exit (loop_to_version));
3661 	  new_exit_e = single_exit (loop_to_version);
3662 	  e = EDGE_SUCC (new_exit_bb, 0);
3663 
3664 	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3665 	       gsi_next (&gsi))
3666 	    {
3667 	      tree new_res;
3668 	      orig_phi = gsi.phi ();
3669 	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3670 	      new_phi = create_phi_node (new_res, new_exit_bb);
3671 	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3672 	      add_phi_arg (new_phi, arg, new_exit_e,
3673 			   gimple_phi_arg_location_from_edge (orig_phi, e));
3674 	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3675 	    }
3676 	}
3677 
3678       update_ssa (TODO_update_ssa);
3679     }
3680 
3681   /* Split the cost model check off to a separate BB.  Costing assumes
3682      this is the only thing we perform when we enter the scalar loop
3683      from a failed cost decision.  */
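  /* Illustration only: the split below turns

	 if (<cost check> & <other checks>) goto vector; else goto scalar;

     into roughly

	 if (<cost check>) goto checks_bb; else goto scalar;
       checks_bb:
	 if (<other checks>) goto vector; else goto scalar;

     so a failed cost check reaches the scalar loop without evaluating the
     remaining versioning checks.  */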
3684   if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3685     {
3686       gimple *def = SSA_NAME_DEF_STMT (cost_name);
3687       gcc_assert (gimple_bb (def) == condition_bb);
3688       /* All uses of the cost check are 'true' after the check we
3689 	 are going to insert.  */
3690       replace_uses_by (cost_name, boolean_true_node);
3691       /* And we're going to build the new single use of it.  */
3692       gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3693 				       NULL_TREE, NULL_TREE);
3694       edge e = split_block (gimple_bb (def), def);
3695       gimple_stmt_iterator gsi = gsi_for_stmt (def);
3696       gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3697       edge true_e, false_e;
3698       extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3699       e->flags &= ~EDGE_FALLTHRU;
3700       e->flags |= EDGE_TRUE_VALUE;
3701       edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3702       e->probability = prob2;
3703       e2->probability = prob2.invert ();
3704       set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3705       auto_vec<basic_block, 3> adj;
3706       for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3707 	   son;
3708 	   son = next_dom_son (CDI_DOMINATORS, son))
3709 	if (EDGE_COUNT (son->preds) > 1)
3710 	  adj.safe_push (son);
3711       for (auto son : adj)
3712 	set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3713     }
3714 
3715   if (version_niter)
3716     {
3717       /* The versioned loop could be infinite, so we need to clear the existing
3718 	 niter information that was copied from the original loop.  */
3719       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3720       vect_free_loop_info_assumptions (nloop);
3721     }
3722 
3723   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3724       && dump_enabled_p ())
3725     {
3726       if (version_alias)
3727         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3728 			 vect_location,
3729                          "loop versioned for vectorization because of "
3730 			 "possible aliasing\n");
3731       if (version_align)
3732         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3733 			 vect_location,
3734                          "loop versioned for vectorization to enhance "
3735 			 "alignment\n");
3736 
3737     }
3738 
3739   return nloop;
3740 }
3741