/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
#include "insn-config.h"
#include "rtl.h"
#include "recog.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has two inner loops, allow there to
		     be an extra basic block which decides, using
		     LOOP_VECTORIZED, which of the two loops to use.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
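
/* One possible illustration (with hypothetical SSA names): if an entry
   mapping x_1 to x_2 at block B1 is pushed first, and an entry mapping
   x_1 to x_3 at a block B2 dominated by B1 is pushed later, the LIFO
   order applies the B2 substitution first, so only the debug uses
   dominated by B2 get x_3; the B1 entry then handles the remaining
   uses.  Applied FIFO, the B1 entry would rewrite every use of x_1,
   leaving nothing for the closer B2 substitution to adjust.  */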
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
	adjust_vec.safe_push (ai);
      else
	adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
			gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
		    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */
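
/* For instance (a purely illustrative case, not tied to any target):
   with a vector type of 8 elements, OFFSET 0 asks for the selector
   { 0, 8, 1, 9, 2, 10, ... }, which interleaves the low halves of the
   two inputs, while OFFSET 1 asks for { 4, 12, 5, 13, 6, 14, ... },
   which interleaves the high halves.  Only the first three pairs are
   pushed explicitly; the rest of the series is implied by the stepped
   vec_perm_builder encoding.  */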

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */
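
/* As an illustration (hypothetical masks): if SRC_RGM holds masks
   { s0, s1 } and DEST_RGM needs masks { d0, d1, d2, d3 }, then d0 and
   d1 are derived from s0 and d2 and d3 from s1, either by unpacking
   the low and high halves of each source mask or, when the mask types
   match, by interleaving each source mask with itself.  */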

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
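
/* As a hypothetical illustration: with a 16-bit COMPARE_TYPE,
   RGM->max_nscalars_per_iter == 1, VF == 8 and
   NITERS + NITERS_SKIP == 0xffff, the IV takes the values 0, 8, ...,
   0xfff8 and then wraps back to 0 without ever exceeding 0xffff, so
   MIGHT_WRAP_P would be true and the code below has to test the IV
   before incrementing it.  */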

static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				    nscalars_step, iv_factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
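
      /* A worked example with illustrative numbers: if NSCALARS_TOTAL
	 is 20, NSCALARS_STEP is 8 and NSCALARS_SKIP is 3, the adjustment
	 is 8 - 3 == 5 and the limit becomes 20 -[sat] 5 == 15, computed
	 below as MAX (20, 5) - 5.  The MAX is what provides the
	 saturation when NSCALARS_TOTAL is smaller than the adjustment.  */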
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nscalars_step);
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
				tree niters, tree final_iv,
				bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
			     niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
			     niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if ((nmasks & 1) == 0)
	  {
	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
	    if (!half_rgm->masks.is_empty ()
		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
	      continue;
	  }

	/* See whether zero-based IV would ever generate all-false masks
	   before wrapping around.  */
	bool might_wrap_p
	  = (iv_limit == -1
	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
				    UNSIGNED)
		 > compare_precision));

	/* Set up all masks for this group.  */
	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
						  &preheader_seq,
						  loop_cond_gsi, rgm,
						  niters, niters_skip,
						  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (class loop *loop, tree niters,
				  tree step, tree final_iv,
				  bool niters_maybe_zero,
				  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
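
      /* Illustrative numbers: with STEP == 4 and NITERS == 12, case B
	 iterates while x <= 8, i.e. for x == 0, 4 and 8, giving
	 12 / 4 == 3 iterations.  With NITERS == 0 in an 8-bit type,
	 case C starts at x == 0xff and keeps iterating while the
	 incremented x is below 0xfc, giving 0x100 / 4 == 64 iterations,
	 the (1 << M) / STEP behaviour described above
	 vect_set_loop_condition.  */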
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
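
/* For example (illustrative numbers only): with NITERS == 10 and
   STEP == 4, a fully-masked loop iterates (10 - 1) / 4 + 1 == 3 times,
   the final iteration running with a partial mask.  Without masking,
   NITERS == 12 gives (12 - 4) / 4 + 1 == 3 iterations; scalar
   iterations that do not fill a whole step are instead handled by the
   peeled epilog loop discussed later in this file.  */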

void
vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
			 tree niters, tree step, tree final_iv,
			 bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
						final_iv, niters_maybe_zero,
						loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
						  final_iv, niters_maybe_zero,
						  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
		     cond_stmt);
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest coming from those
   edges, ensure that each TO->dest PHI argument has its current_def
   set to that of the corresponding FROM->dest argument.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
	{
	  gsi_next (&gsi_from);
	  continue;
	}
      if (virtual_operand_p (to_arg))
	{
	  gsi_next (&gsi_to);
	  continue;
	}
      if (TREE_CODE (from_arg) != SSA_NAME)
	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else if (TREE_CODE (to_arg) == SSA_NAME
	       && from_arg != to_arg)
	{
	  if (get_current_def (to_arg) == NULL_TREE)
	    {
	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
					      TREE_TYPE (get_current_def
							   (from_arg))));
	      set_current_def (to_arg, get_current_def (from_arg));
	    }
	}
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}


/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */

class loop *
slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
					class loop *scalar_loop, edge e)
{
  class loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
	 header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
						exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
							   0),
						EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
	{
	  gphi_iterator gsi;
	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);

	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gphi *phi = gsi.phi ();
	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
	      location_t orig_locus
		= gimple_phi_arg_location_from_edge (phi, e);

	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the unnecessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
			  single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
	{
	  /* Remove the unnecessary forwarder of scalar_loop again.  */
	  redirect_edge_pred (single_succ_edge (preheader),
			      single_pred (preheader));
	  delete_basic_block (preheader);
	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
				   loop_preheader_edge (scalar_loop)->src);
	  preheader = split_edge (loop_preheader_edge (loop));
	  entry_e = single_pred_edge (preheader);
	}

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the unnecessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
			  single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
			       loop_preheader_edge (new_loop)->src);
    }

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
	 edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
	   gsi_new = gsi_start_phis (new_loop->header);
	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  gphi *new_phi = gsi_new.phi ();
	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
	  location_t orig_locus
	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
	}
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}


/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
				 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}


/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks (the header and an empty
       latch) for an innermost loop, or of 5 basic blocks for an
       outer loop;
   (2) it is single entry, single exit;
   (3) its exit condition is the last stmt in the header;
   (4) E is the entry/exit edge of LOOP.  */

bool
slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

/* If the loop has a virtual PHI, but the exit bb doesn't, create a virtual
   PHI in the exit bb and rename all the uses after the loop.  This
   simplifies the *guard[12] routines, which assume loop closed SSA form
   for all PHIs (but normally loop closed SSA form doesn't require virtual
   PHIs to be in the same form).  Doing this early simplifies checking
   which uses should be renamed.

   If we create a new phi after the loop, return the definition that
   applies on entry to the loop, otherwise return null.  */

static tree
create_lcssa_for_virtual_phi (class loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
	gphi *phi = gsi.phi ();
	for (gsi = gsi_start_phis (exit_e->dest);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
	    break;
	if (gsi_end_p (gsi))
	  {
	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
	    imm_use_iterator imm_iter;
	    gimple *stmt;
	    use_operand_p use_p;

	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
	    gimple_phi_set_result (new_phi, new_vop);
	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
	      if (stmt != new_phi
		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  SET_USE (use_p, new_vop);

	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
	  }
	break;
      }
  return NULL_TREE;
}

/* Function find_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if one can be found, otherwise return an
   unknown location.  */

dump_user_location_t
find_loop_location (class loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location.  */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
        return stmt;
    }

  return dump_user_location_t ();
}

/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}

/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 phi_info->stmt);

      /* Skip virtual phi's. The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

	 Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
        {
	  if (dump_enabled_p ())
	    dump_printf (MSG_MISSED_OPTIMIZATION,
			 "No access function or evolution.\n");
	  return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
	 whose evolution function is not invariant in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution not invariant in loop.\n");
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 whose evolution function is a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution is chrec.\n");
	  return false;
	}
    }

  return true;
}


/*   Function vect_update_ivs_after_vectorizer.

     "Advance" the induction variables of LOOP to the value they should take
     after the execution of LOOP.  This is currently necessary because the
     vectorizer does not handle induction variables that are used after the
     loop.  Such a situation occurs when the last iterations of LOOP are
     peeled, because:
     1. We introduced new uses after LOOP for IVs that were not originally used
        after LOOP: the IVs of LOOP are now used by an epilog loop.
     2. LOOP is going to be vectorized; this means that it will iterate N/VF
        times, whereas the loop IVs should be bumped N times.

     Input:
     - LOOP - a loop that is going to be vectorized. The last few iterations
              of LOOP were peeled.
     - NITERS - the number of iterations that LOOP executes (before it is
                vectorized), i.e., the number of times the ivs should be
                bumped.
     - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
                  coming out from LOOP on which there are uses of the LOOP ivs
		  (this is the path from LOOP->exit to epilog_loop->preheader).

                  The new definitions of the ivs are placed in LOOP->exit.
                  The phi args associated with the edge UPDATE_E in the bb
                  UPDATE_E->dest are updated accordingly.

     Assumption 1: Like the rest of the vectorizer, this function assumes
     a single loop exit that has a single predecessor.

     Assumption 2: The phi nodes in the LOOP header and in update_bb are
     organized in the same order.

     Assumption 3: The access function of the ivs is simple enough (see
     vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

     Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
     coming out of LOOP on which the ivs of LOOP are used (this is the path
     that leads to the epilog loop; other paths skip the epilog loop).  This
     path starts with the edge UPDATE_E, and its destination (denoted update_bb)
     needs to have its phis updated.
 */
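
/* As an illustrative sketch (not from any particular testcase): for a
   scalar IV with initial value 5 and step 2, the value that the epilog
   loop must start from after NITERS scalar iterations is
   5 + 2 * NITERS, which is exactly the INIT_EXPR + NITERS * STEP_EXPR
   expression built below.  */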
1475 
1476 static void
vect_update_ivs_after_vectorizer(loop_vec_info loop_vinfo,tree niters,edge update_e)1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 				  tree niters, edge update_e)
1479 {
1480   gphi_iterator gsi, gsi1;
1481   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482   basic_block update_bb = update_e->dest;
1483   basic_block exit_bb = single_exit (loop)->dest;
1484 
1485   /* Make sure there exists a single-predecessor exit bb:  */
1486   gcc_assert (single_pred_p (exit_bb));
1487   gcc_assert (single_succ_edge (exit_bb) == update_e);
1488 
1489   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491        gsi_next (&gsi), gsi_next (&gsi1))
1492     {
1493       tree init_expr;
1494       tree step_expr, off;
1495       tree type;
1496       tree var, ni, ni_name;
1497       gimple_stmt_iterator last_gsi;
1498 
1499       gphi *phi = gsi.phi ();
1500       gphi *phi1 = gsi1.phi ();
1501       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502       if (dump_enabled_p ())
1503 	dump_printf_loc (MSG_NOTE, vect_location,
1504 			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505 
1506       /* Skip reduction and virtual phis.  */
1507       if (!iv_phi_p (phi_info))
1508 	{
1509 	  if (dump_enabled_p ())
1510 	    dump_printf_loc (MSG_NOTE, vect_location,
1511 			     "reduc or virtual phi. skip.\n");
1512 	  continue;
1513 	}
1514 
1515       type = TREE_TYPE (gimple_phi_result (phi));
1516       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517       step_expr = unshare_expr (step_expr);
1518 
1519       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520          of degree >= 2 or exponential.  */
1521       gcc_assert (!tree_is_chrec (step_expr));
1522 
1523       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524 
1525       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 			 fold_convert (TREE_TYPE (step_expr), niters),
1527 			 step_expr);
1528       if (POINTER_TYPE_P (type))
1529 	ni = fold_build_pointer_plus (init_expr, off);
1530       else
1531 	ni = fold_build2 (PLUS_EXPR, type,
1532 			  init_expr, fold_convert (type, off));
1533 
1534       var = create_tmp_var (type, "tmp");
1535 
1536       last_gsi = gsi_last_bb (exit_bb);
1537       gimple_seq new_stmts = NULL;
1538       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539       /* Insert the new stmts at the end of EXIT_BB, which may be empty.  */
1540       if (!gsi_end_p (last_gsi))
1541 	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542       else
1543 	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544 
1545       /* Fix phi expressions in the successor bb.  */
1546       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547     }
1548 }
1549 
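/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the IV advance performed by
   vect_update_ivs_after_vectorizer above: the definition placed in the
   exit block is init_expr + niters * step_expr.  */
#if 0
#include <assert.h>
#include <stdint.h>

/* Advance an IV with initial value INIT and loop-invariant step STEP as
   if NITERS scalar iterations had executed.  */
static uint64_t
advance_iv (uint64_t init, uint64_t step, uint64_t niters)
{
  return init + niters * step;
}

int
main (void)
{
  /* i = 7; 100 iterations of i += 3 leave i == 307.  */
  assert (advance_iv (7, 3, 100) == 307);
  return 0;
}
#endif
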
1550 /* Return a gimple value containing the misalignment (measured in vector
1551    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552    it is away from a perfectly aligned address.  Add any new statements
1553    to SEQ.  */
1554 
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557 {
1558   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559   stmt_vec_info stmt_info = dr_info->stmt;
1560   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561 
1562   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563   unsigned HOST_WIDE_INT target_align_c;
1564   tree target_align_minus_1;
1565 
1566   bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 					size_zero_node) < 0;
1568   tree offset = (negative
1569 		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 		 : size_zero_node);
1571   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572 							  offset);
1573   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1574   if (target_align.is_constant (&target_align_c))
1575     target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576   else
1577     {
1578       tree vla = build_int_cst (type, target_align);
1579       tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580 				    fold_build2 (MINUS_EXPR, type,
1581 						 build_int_cst (type, 0), vla));
1582       target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583 					  build_int_cst (type, 1));
1584     }
1585 
1586   HOST_WIDE_INT elem_size
1587     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589 
1590   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1591   tree int_start_addr = fold_convert (type, start_addr);
1592   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593 					target_align_minus_1);
1594 
1595   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1596   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597 					elem_size_log);
1598 
1599   return misalign_in_elems;
1600 }
1601 
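/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the computation above: the misalignment in
   bytes is the low bits of the start address, and the division by the
   element size is a right shift because elem_size is a power of two.  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t
misalign_in_elems (uint64_t addr, uint64_t target_align,
		   unsigned elem_size_log)
{
  uint64_t misalign_in_bytes = addr & (target_align - 1);
  return misalign_in_bytes >> elem_size_log;
}

int
main (void)
{
  /* Address 0x1008 against 16-byte alignment with 4-byte elements:
     8 bytes, i.e. 2 elements, away from an aligned address.  */
  assert (misalign_in_elems (0x1008, 16, 2) == 2);
  assert (misalign_in_elems (0x1000, 16, 2) == 0);
  return 0;
}
#endif
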
1602 /* Function vect_gen_prolog_loop_niters
1603 
1604    Generate the number of iterations which should be peeled as prolog for the
1605    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1606    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607    As a result, after the execution of this loop, the data reference DR will
1608    refer to an aligned location.  The following computation is generated:
1609 
1610    If the misalignment of DR is known at compile time:
1611      addr_mis = int mis = DR_MISALIGNMENT (dr);
1612    Else, compute address misalignment in bytes:
1613      addr_mis = addr & (target_align - 1)
1614 
1615    prolog_niters = ((VF - addr_mis / elem_size) & (VF - 1)) / step
1616 
1617    (elem_size = element type size; an element is the scalar element whose type
1618    is the inner type of the vectype)
1619 
1620    The computations will be emitted at the end of BB.  We also compute and
1621    store the upper bound (inclusive) of the result in BOUND.
1622 
1623    When the step of the data-ref in the loop is not 1 (as in interleaved data
1624    and SLP), the number of iterations of the prolog must be divided by the step
1625    (which is equal to the size of the interleaved group).
1626 
1627    The above formulas assume that VF == number of elements in the vector. This
1628    may not hold when there are multiple types in the loop.
1629    In this case, for some data-references in the loop the VF does not represent
1630    the number of elements that fit in the vector.  Therefore, instead of VF we
1631    use TYPE_VECTOR_SUBPARTS.  */
1632 
1633 static tree
1634 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635 			     basic_block bb, int *bound)
1636 {
1637   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1638   tree var;
1639   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640   gimple_seq stmts = NULL, new_stmts = NULL;
1641   tree iters, iters_name;
1642   stmt_vec_info stmt_info = dr_info->stmt;
1643   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1644   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1645 
1646   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1647     {
1648       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649 
1650       if (dump_enabled_p ())
1651         dump_printf_loc (MSG_NOTE, vect_location,
1652                          "known peeling = %d.\n", npeel);
1653 
1654       iters = build_int_cst (niters_type, npeel);
1655       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1656     }
1657   else
1658     {
1659       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660       tree type = TREE_TYPE (misalign_in_elems);
1661       HOST_WIDE_INT elem_size
1662 	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1663       /* We only do prolog peeling if the target alignment is known at compile
1664          time.  */
1665       poly_uint64 align_in_elems
1666 	= exact_div (target_align, elem_size);
1667       tree align_in_elems_minus_1
1668 	= build_int_cst (type, align_in_elems - 1);
1669       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670 
1671       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1672 				 & (align_in_elems - 1)).  */
1673       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674 					    size_zero_node) < 0;
1675       if (negative)
1676 	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677 			     align_in_elems_tree);
1678       else
1679 	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680 			     misalign_in_elems);
1681       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1682       iters = fold_convert (niters_type, iters);
1683       unsigned HOST_WIDE_INT align_in_elems_c;
1684       if (align_in_elems.is_constant (&align_in_elems_c))
1685 	*bound = align_in_elems_c - 1;
1686       else
1687 	*bound = -1;
1688     }
1689 
1690   if (dump_enabled_p ())
1691     dump_printf_loc (MSG_NOTE, vect_location,
1692 		     "niters for prolog loop: %T\n", iters);
1693 
1694   var = create_tmp_var (niters_type, "prolog_loop_niters");
1695   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1696 
1697   if (new_stmts)
1698     gimple_seq_add_seq (&stmts, new_stmts);
1699   if (stmts)
1700     {
1701       gcc_assert (single_succ_p (bb));
1702       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703       if (gsi_end_p (gsi))
1704 	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705       else
1706 	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1707     }
1708   return iters_name;
1709 }
1710 
1711 
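/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the unknown-misalignment case above for a
   non-negative step:
   iters = (align_in_elems - misalign_in_elems) & (align_in_elems - 1).  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t
prolog_niters (uint64_t misalign_in_elems, uint64_t align_in_elems)
{
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}

int
main (void)
{
  /* 4 elements per alignment unit: 3 misaligned elements need 1 peeled
     iteration, an aligned access needs none, 1 misaligned element needs 3.  */
  assert (prolog_niters (3, 4) == 1);
  assert (prolog_niters (0, 4) == 0);
  assert (prolog_niters (1, 4) == 3);
  return 0;
}
#endif
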
1712 /* Function vect_update_init_of_dr
1713 
1714    If CODE is PLUS, the vector loop starts NITERS iterations after the
1715    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716    iterations before the scalar one (using masking to skip inactive
1717    elements).  This function updates the information recorded in DR to
1718    account for the difference.  Specifically, it updates the OFFSET
1719    field of DR_INFO.  */
1720 
1721 static void
1722 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1723 {
1724   struct data_reference *dr = dr_info->dr;
1725   tree offset = dr_info->offset;
1726   if (!offset)
1727     offset = build_zero_cst (sizetype);
1728 
1729   niters = fold_build2 (MULT_EXPR, sizetype,
1730 			fold_convert (sizetype, niters),
1731 			fold_convert (sizetype, DR_STEP (dr)));
1732   offset = fold_build2 (code, sizetype,
1733 			fold_convert (sizetype, offset), niters);
1734   dr_info->offset = offset;
1735 }
1736 
1737 
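/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the adjustment above: the recorded offset
   moves by niters * step, added for PLUS_EXPR and subtracted for
   MINUS_EXPR.  */
#if 0
#include <assert.h>
#include <stdint.h>

static int64_t
update_dr_offset (int64_t offset, int64_t niters, int64_t step, int plus_p)
{
  int64_t delta = niters * step;
  return plus_p ? offset + delta : offset - delta;
}

int
main (void)
{
  /* Skipping 3 iterations of an access with a 4-byte step moves the
     start 12 bytes forward; the MINUS form undoes it.  */
  assert (update_dr_offset (0, 3, 4, 1) == 12);
  assert (update_dr_offset (12, 3, 4, 0) == 0);
  return 0;
}
#endif
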
1738 /* Function vect_update_inits_of_drs
1739 
1740    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1741    CODE and NITERS are as for vect_update_init_of_dr.  */
1742 
1743 void
1744 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745 			  tree_code code)
1746 {
1747   unsigned int i;
1748   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1749   struct data_reference *dr;
1750 
1751   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1752 
1753   /* Adjust niters to sizetype.  We used to insert the stmts on the loop
1754      preheader here, but since we might use these niters to update the
1755      epilogue's niters and data references, we can't insert them here, as
1756      this definition might not always dominate its uses.  */
1757   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1758     niters = fold_convert (sizetype, niters);
1759 
1760   FOR_EACH_VEC_ELT (datarefs, i, dr)
1761     {
1762       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1764 	  && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1765 	vect_update_init_of_dr (dr_info, niters, code);
1766     }
1767 }
1768 
1769 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770    by masking.  This involves calculating the number of iterations to
1771    be peeled and then aligning all memory references appropriately.  */
1772 
1773 void
1774 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775 {
1776   tree misalign_in_elems;
1777   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778 
1779   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780 
1781   /* From the information recorded in LOOP_VINFO get the number of iterations
1782      that need to be skipped via masking.  */
1783   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784     {
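      /* For example (hypothetical numbers): with VF == 8 and
	 LOOP_VINFO_PEELING_FOR_ALIGNMENT == 3, masking has to skip
	 8 - 3 == 5 elements at the start of the first vector iteration.  */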
1785       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787       misalign_in_elems = build_int_cst (type, misalign);
1788     }
1789   else
1790     {
1791       gimple_seq seq1 = NULL, seq2 = NULL;
1792       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793       misalign_in_elems = fold_convert (type, misalign_in_elems);
1794       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795 						&seq2, true, NULL_TREE);
1796       gimple_seq_add_seq (&seq1, seq2);
1797       if (seq1)
1798 	{
1799 	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801 	  gcc_assert (!new_bb);
1802 	}
1803     }
1804 
1805   if (dump_enabled_p ())
1806     dump_printf_loc (MSG_NOTE, vect_location,
1807 		     "misalignment for fully-masked loop: %T\n",
1808 		     misalign_in_elems);
1809 
1810   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811 
1812   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813 }
1814 
1815 /* This function builds ni_name = number of iterations.  Statements
1816    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1817    it to TRUE if a new SSA variable was generated.  */
1818 
1819 tree
1820 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1821 {
1822   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823   if (TREE_CODE (ni) == INTEGER_CST)
1824     return ni;
1825   else
1826     {
1827       tree ni_name, var;
1828       gimple_seq stmts = NULL;
1829       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1830 
1831       var = create_tmp_var (TREE_TYPE (ni), "niters");
1832       ni_name = force_gimple_operand (ni, &stmts, false, var);
1833       if (stmts)
1834 	{
1835 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1836 	  if (new_var_p != NULL)
1837 	    *new_var_p = true;
1838 	}
1839 
1840       return ni_name;
1841     }
1842 }
1843 
1844 /* Calculate the number of iterations above which the vectorized loop is
1845    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1846    of prolog loop.  If it's integer const, the integer number is also passed
1847    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1848    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1849    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1850    threshold below which the scalar (rather than vectorized) loop will be
1851    executed.  This function stores the upper bound (inclusive) of the result
1852    in BOUND_SCALAR.  */
1853 
1854 static tree
1855 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856 			     int bound_prolog, poly_int64 bound_epilog, int th,
1857 			     poly_uint64 *bound_scalar,
1858 			     bool check_profitability)
1859 {
1860   tree type = TREE_TYPE (niters_prolog);
1861   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862 			     build_int_cst (type, bound_epilog));
1863 
1864   *bound_scalar = bound_prolog + bound_epilog;
1865   if (check_profitability)
1866     {
1867       /* TH indicates the minimum niters of vectorized loop, while we
1868 	 compute the maximum niters of scalar loop.  */
1869       th--;
1870       /* Peeling for constant times.  */
1871       if (int_niters_prolog >= 0)
1872 	{
1873 	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874 	  return build_int_cst (type, *bound_scalar);
1875 	}
1876       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1877 	 and BOUND_EPILOG are inclusive upper bounds.  */
1878       if (known_ge (th, bound_prolog + bound_epilog))
1879 	{
1880 	  *bound_scalar = th;
1881 	  return build_int_cst (type, th);
1882 	}
1883       /* Need to do runtime comparison.  */
1884       else if (maybe_gt (th, bound_epilog))
1885 	{
1886 	  *bound_scalar = upper_bound (*bound_scalar, th);
1887 	  return fold_build2 (MAX_EXPR, type,
1888 			      build_int_cst (type, th), niters);
1889 	}
1890     }
1891   return niters;
1892 }
1893 
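/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the constant-prolog profitability path above:
   the scalar loop runs at most max (TH - 1, niters_prolog + bound_epilog)
   iterations.  */
#if 0
#include <assert.h>

static int
scalar_loop_niters (int niters_prolog, int bound_epilog, int th)
{
  int niters = niters_prolog + bound_epilog;
  /* TH is the minimum niters of the vectorized loop, so the scalar
     loop covers at most TH - 1 iterations.  */
  th--;
  return niters > th ? niters : th;
}

int
main (void)
{
  /* Prolog peels 2, epilog bound is 3, vectorization pays off from
     16 iterations: the scalar path covers up to 15 iterations.  */
  assert (scalar_loop_niters (2, 3, 16) == 15);
  /* With a low threshold the peeling bound dominates.  */
  assert (scalar_loop_niters (2, 3, 4) == 5);
  return 0;
}
#endif
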
1894 /* NITERS is the number of times that the original scalar loop executes
1895    after peeling.  Work out the maximum number of iterations N that can
1896    be handled by the vectorized form of the loop and then either:
1897 
1898    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1899 
1900 	niters_vector = N
1901 
1902    b) set *STEP_VECTOR_PTR to one and generate:
1903 
1904         niters_vector = N / vf
1905 
1906    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1908    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1909 
1910 void
1911 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912 			     tree *niters_vector_ptr, tree *step_vector_ptr,
1913 			     bool niters_no_overflow)
1914 {
1915   tree ni_minus_gap, var;
1916   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919   tree log_vf = NULL_TREE;
1920 
1921   /* If epilogue loop is required because of data accesses with gaps, we
1922      subtract one iteration from the total number of iterations here for
1923      correct calculation of RATIO.  */
1924   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1925     {
1926       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927 				  build_one_cst (type));
1928       if (!is_gimple_val (ni_minus_gap))
1929 	{
1930 	  var = create_tmp_var (type, "ni_gap");
1931 	  gimple *stmts = NULL;
1932 	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933 					       true, var);
1934 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1935 	}
1936     }
1937   else
1938     ni_minus_gap = niters;
1939 
1940   unsigned HOST_WIDE_INT const_vf;
1941   if (vf.is_constant (&const_vf)
1942       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943     {
1944       /* Create: niters >> log2(vf) */
1945       /* If it's known that niters == number of latch executions + 1 doesn't
1946 	 overflow, we can generate niters >> log2(vf); otherwise we generate
1947 	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948 	 will be at least one.  */
1949       log_vf = build_int_cst (type, exact_log2 (const_vf));
1950       if (niters_no_overflow)
1951 	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952       else
1953 	niters_vector
1954 	  = fold_build2 (PLUS_EXPR, type,
1955 			 fold_build2 (RSHIFT_EXPR, type,
1956 				      fold_build2 (MINUS_EXPR, type,
1957 						   ni_minus_gap,
1958 						   build_int_cst (type, vf)),
1959 				      log_vf),
1960 			 build_int_cst (type, 1));
1961       step_vector = build_one_cst (type);
1962     }
1963   else
1964     {
1965       niters_vector = ni_minus_gap;
1966       step_vector = build_int_cst (type, vf);
1967     }
1968 
1969   if (!is_gimple_val (niters_vector))
1970     {
1971       var = create_tmp_var (type, "bnd");
1972       gimple_seq stmts = NULL;
1973       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974       gsi_insert_seq_on_edge_immediate (pe, stmts);
1975       /* The peeling algorithm guarantees that the vector loop bound is at
1976 	 least ONE; we set range information to make the niters analyzer's
1977 	 life easier.  Note the number of latch iterations can be TYPE_MAX_VALUE,
1978 	 so we must represent the vector niter as TYPE_MAX_VALUE + 1 >> log_vf.  */
1979       if (stmts != NULL && log_vf)
1980 	{
1981 	  if (niters_no_overflow)
1982 	    set_range_info (niters_vector, VR_RANGE,
1983 			    wi::one (TYPE_PRECISION (type)),
1984 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1985 						       TYPE_SIGN (type)),
1986 					exact_log2 (const_vf),
1987 					TYPE_SIGN (type)));
1988 	  /* For VF == 1 the vector IV might also overflow so we cannot
1989 	     assert a minimum value of 1.  */
1990 	  else if (const_vf > 1)
1991 	    set_range_info (niters_vector, VR_RANGE,
1992 			    wi::one (TYPE_PRECISION (type)),
1993 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1994 						       TYPE_SIGN (type))
1995 					- (const_vf - 1),
1996 					exact_log2 (const_vf), TYPE_SIGN (type))
1997 			    + 1);
1998 	}
1999     }
2000   *niters_vector_ptr = niters_vector;
2001   *step_vector_ptr = step_vector;
2002 
2003   return;
2004 }
2005 
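/* A minimal standalone sketch (plain C, hypothetical values; kept out of
   the build with #if 0) of the two forms above, using uint8_t so the
   wrap-around case is easy to exercise: when NITERS may have wrapped to 0
   (TYPE_MAX_VALUE + 1), ((niters - vf) >> log2 (vf)) + 1 is still correct
   because at least one vector iteration is guaranteed.  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint8_t
niters_vector (uint8_t niters, unsigned log2_vf, int no_overflow)
{
  uint8_t vf = (uint8_t) (1u << log2_vf);
  if (no_overflow)
    return niters >> log2_vf;
  return (uint8_t) (((uint8_t) (niters - vf)) >> log2_vf) + 1;
}

int
main (void)
{
  /* 64 iterations, VF == 4: both forms agree on 16.  */
  assert (niters_vector (64, 2, 1) == 16);
  assert (niters_vector (64, 2, 0) == 16);
  /* 256 iterations wrap to 0 in 8 bits; the safe form still yields 64.  */
  assert (niters_vector (0, 2, 0) == 64);
  return 0;
}
#endif
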
2006 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2007    loop specified by LOOP_VINFO after vectorization, compute the number
2008    of iterations before vectorization (niters_vector * vf) and store it
2009    to NITERS_VECTOR_MULT_VF_PTR.  */
2010 
2011 static void
2012 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2013 				     tree niters_vector,
2014 				     tree *niters_vector_mult_vf_ptr)
2015 {
2016   /* We should be using a step_vector of VF if VF is variable.  */
2017   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2018   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2019   tree type = TREE_TYPE (niters_vector);
2020   tree log_vf = build_int_cst (type, exact_log2 (vf));
2021   basic_block exit_bb = single_exit (loop)->dest;
2022 
2023   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2024   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2025 					    niters_vector, log_vf);
2026   if (!is_gimple_val (niters_vector_mult_vf))
2027     {
2028       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2029       gimple_seq stmts = NULL;
2030       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2031 						    &stmts, true, var);
2032       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2033       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2034     }
2035   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2036 }
2037 
2038 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2039 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2040    This function searches for the corresponding lcssa phi node in the exit
2041    bb of LOOP.  If it is found, return the phi result; otherwise return
2042 
2043 static tree
2044 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2045 		gphi *lcssa_phi)
2046 {
2047   gphi_iterator gsi;
2048   edge e = single_exit (loop);
2049 
2050   gcc_assert (single_pred_p (e->dest));
2051   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2052     {
2053       gphi *phi = gsi.phi ();
2054       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2055 			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2056 	return PHI_RESULT (phi);
2057     }
2058   return NULL_TREE;
2059 }
2060 
2061 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2062    from SECOND/FIRST and puts it at the original loop's preheader/exit
2063    edge; the two loops are arranged as below:
2064 
2065        preheader_a:
2066      first_loop:
2067        header_a:
2068 	 i_1 = PHI<i_0, i_2>;
2069 	 ...
2070 	 i_2 = i_1 + 1;
2071 	 if (cond_a)
2072 	   goto latch_a;
2073 	 else
2074 	   goto between_bb;
2075        latch_a:
2076 	 goto header_a;
2077 
2078        between_bb:
2079 	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2080 
2081      second_loop:
2082        header_b:
2083 	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2084 				 or with i_2 if no LCSSA phi is created
2085 				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2086 	 ...
2087 	 i_4 = i_3 + 1;
2088 	 if (cond_b)
2089 	   goto latch_b;
2090 	 else
2091 	   goto exit_bb;
2092        latch_b:
2093 	 goto header_b;
2094 
2095        exit_bb:
2096 
2097    This function creates loop-closed SSA for the first loop and updates the
2098    second loop's PHI nodes by replacing the argument on the incoming edge
2099    with the result of the newly created lcssa PHI nodes.  If
2100    CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed SSA PHIs will only be
2101    created for the non-IV PHIs of the first loop.
2102 
2103    This function assumes the exit bb of the first loop is the preheader bb
2104    of the second loop, i.e., between_bb in the example code.  With the
2105    PHIs updated, the second loop will execute the remaining iterations of the first.  */
2106 
2107 static void
2108 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2109 				   class loop *first, class loop *second,
2110 				   bool create_lcssa_for_iv_phis)
2111 {
2112   gphi_iterator gsi_update, gsi_orig;
2113   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2114 
2115   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2116   edge second_preheader_e = loop_preheader_edge (second);
2117   basic_block between_bb = single_exit (first)->dest;
2118 
2119   gcc_assert (between_bb == second_preheader_e->src);
2120   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2121   /* Either the first loop or the second is the loop to be vectorized.  */
2122   gcc_assert (loop == first || loop == second);
2123 
2124   for (gsi_orig = gsi_start_phis (first->header),
2125        gsi_update = gsi_start_phis (second->header);
2126        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2127        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2128     {
2129       gphi *orig_phi = gsi_orig.phi ();
2130       gphi *update_phi = gsi_update.phi ();
2131 
2132       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2133       /* Generate lcssa PHI node for the first loop.  */
2134       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2135       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2136       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2137 	{
2138 	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2139 	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2140 	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2141 	  arg = new_res;
2142 	}
2143 
2144       /* Update PHI node in the second loop by replacing arg on the loop's
2145 	 incoming edge.  */
2146       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2147     }
2148 
2149   /* For epilogue peeling we have to make sure to copy all LC PHIs
2150      for correct vectorization of live stmts.  */
2151   if (loop == first)
2152     {
2153       basic_block orig_exit = single_exit (second)->dest;
2154       for (gsi_orig = gsi_start_phis (orig_exit);
2155 	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2156 	{
2157 	  gphi *orig_phi = gsi_orig.phi ();
2158 	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2159 	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2160 	    continue;
2161 
2162 	  /* Already created in the above loop.   */
2163 	  if (find_guard_arg (first, second, orig_phi))
2164 	    continue;
2165 
2166 	  tree new_res = copy_ssa_name (orig_arg);
2167 	  gphi *lcphi = create_phi_node (new_res, between_bb);
2168 	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2169 	}
2170     }
2171 }
2172 
2173 /* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2174    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2175    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2176    appear like below:
2177 
2178        guard_bb:
2179 	 if (cond)
2180 	   goto merge_bb;
2181 	 else
2182 	   goto skip_loop;
2183 
2184      skip_loop:
2185        header_a:
2186 	 i_1 = PHI<i_0, i_2>;
2187 	 ...
2188 	 i_2 = i_1 + 1;
2189 	 if (cond_a)
2190 	   goto latch_a;
2191 	 else
2192 	   goto exit_a;
2193        latch_a:
2194 	 goto header_a;
2195 
2196        exit_a:
2197 	 i_5 = PHI<i_2>;
2198 
2199        merge_bb:
2200 	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2201 
2202      update_loop:
2203        header_b:
2204 	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2205 	 ...
2206 	 i_4 = i_3 + 1;
2207 	 if (cond_b)
2208 	   goto latch_b;
2209 	 else
2210 	   goto exit_bb;
2211        latch_b:
2212 	 goto header_b;
2213 
2214        exit_bb:
2215 
2216    This function creates PHI nodes at merge_bb and replaces the use of i_5
2217    in the update_loop's PHI node with the result of the new PHI.  */
2218 
2219 static void
2220 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2221 				    class loop *update_loop,
2222 				    edge guard_edge, edge merge_edge)
2223 {
2224   location_t merge_loc, guard_loc;
2225   edge orig_e = loop_preheader_edge (skip_loop);
2226   edge update_e = loop_preheader_edge (update_loop);
2227   gphi_iterator gsi_orig, gsi_update;
2228 
2229   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2230 	gsi_update = gsi_start_phis (update_loop->header));
2231        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2232        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2233     {
2234       gphi *orig_phi = gsi_orig.phi ();
2235       gphi *update_phi = gsi_update.phi ();
2236 
2237       /* Generate new phi node at merge bb of the guard.  */
2238       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2239       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2240 
2241       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2242 	 args in NEW_PHI for these edges.  */
2243       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2244       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2245       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2246       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2247       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2248       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2249 
2250       /* Update phi in UPDATE_PHI.  */
2251       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2252     }
2253 }
2254 
2255 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2256    from LOOP.  Function slpeel_add_loop_guard adds a guard that skips from a
2257    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2258    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2259    The CFG looks like:
2260 
2261      loop:
2262        header_a:
2263 	 i_1 = PHI<i_0, i_2>;
2264 	 ...
2265 	 i_2 = i_1 + 1;
2266 	 if (cond_a)
2267 	   goto latch_a;
2268 	 else
2269 	   goto exit_a;
2270        latch_a:
2271 	 goto header_a;
2272 
2273        exit_a:
2274 
2275        guard_bb:
2276 	 if (cond)
2277 	   goto merge_bb;
2278 	 else
2279 	   goto epilog_loop;
2280 
2281        ;; fall_through_bb
2282 
2283      epilog_loop:
2284        header_b:
2285 	 i_3 = PHI<i_2, i_4>;
2286 	 ...
2287 	 i_4 = i_3 + 1;
2288 	 if (cond_b)
2289 	   goto latch_b;
2290 	 else
2291 	   goto merge_bb;
2292        latch_b:
2293 	 goto header_b;
2294 
2295        merge_bb:
2296 	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2297 
2298        exit_bb:
2299 	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2300 
2301    For each name used outside EPILOG (i.e. for each name that has an lcssa
2302    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2303    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2304    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2305    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2306    in exit_bb will also be updated.  */
2307 
2308 static void
2309 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2310 				    edge guard_edge, edge merge_edge)
2311 {
2312   gphi_iterator gsi;
2313   basic_block merge_bb = guard_edge->dest;
2314 
2315   gcc_assert (single_succ_p (merge_bb));
2316   edge e = single_succ_edge (merge_bb);
2317   basic_block exit_bb = e->dest;
2318   gcc_assert (single_pred_p (exit_bb));
2319   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2320 
2321   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2322     {
2323       gphi *update_phi = gsi.phi ();
2324       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2325 
2326       tree merge_arg = NULL_TREE;
2327 
2328       /* If the old argument is an SSA_NAME, use its current_def.  */
2329       if (TREE_CODE (old_arg) == SSA_NAME)
2330 	merge_arg = get_current_def (old_arg);
2331       /* If it's a constant or doesn't have a current_def, just use the old
2332 	 argument.  */
2333       if (!merge_arg)
2334 	merge_arg = old_arg;
2335 
2336       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2337       /* If the var is live after loop but not a reduction, we simply
2338 	 use the old arg.  */
2339       if (!guard_arg)
2340 	guard_arg = old_arg;
2341 
2342       /* Create new phi node in MERGE_BB:  */
2343       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2344       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2345 
2346       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2347 	 the two PHI args in merge_phi for these edges.  */
2348       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2349       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2350 
2351       /* Update the original phi in exit_bb.  */
2352       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2353     }
2354 }
2355 
2356 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2357    the args of its loop-closed SSA PHIs need to be updated.  */
2358 
2359 static void
2360 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2361 {
2362   gphi_iterator gsi;
2363   basic_block exit_bb = single_exit (epilog)->dest;
2364 
2365   gcc_assert (single_pred_p (exit_bb));
2366   edge e = EDGE_PRED (exit_bb, 0);
2367   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2368     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2369 }
2370 
2371 /* Function vect_do_peeling.
2372 
2373    Input:
2374    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2375 
2376        preheader:
2377      LOOP:
2378        header_bb:
2379 	 loop_body
2380 	 if (exit_loop_cond) goto exit_bb
2381 	 else                goto header_bb
2382        exit_bb:
2383 
2384    - NITERS: The number of iterations of the loop.
2385    - NITERSM1: The number of iterations of the loop's latch.
2386    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2387    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2388 			      CHECK_PROFITABILITY is true.
2389    Output:
2390    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2391      iterate after vectorization; see vect_set_loop_condition for details.
2392    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2393      should be set to the number of scalar iterations handled by the
2394      vector loop.  The SSA name is only used on exit from the loop.
2395 
2396    This function peels prolog and epilog from the loop, adds guards skipping
2397    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2398    would look like:
2399 
2400        guard_bb_1:
2401 	 if (prefer_scalar_loop) goto merge_bb_1
2402 	 else                    goto guard_bb_2
2403 
2404        guard_bb_2:
2405          if (skip_prolog) goto merge_bb_2
2406          else             goto prolog_preheader
2407 
2408        prolog_preheader:
2409      PROLOG:
2410        prolog_header_bb:
2411 	 prolog_body
2412 	 if (exit_prolog_cond) goto prolog_exit_bb
2413 	 else                  goto prolog_header_bb
2414        prolog_exit_bb:
2415 
2416        merge_bb_2:
2417 
2418        vector_preheader:
2419      VECTOR LOOP:
2420        vector_header_bb:
2421 	 vector_body
2422 	 if (exit_vector_cond) goto vector_exit_bb
2423 	 else                  goto vector_header_bb
2424        vector_exit_bb:
2425 
2426        guard_bb_3:
2427 	 if (skip_epilog) goto merge_bb_3
2428 	 else             goto epilog_preheader
2429 
2430        merge_bb_1:
2431 
2432        epilog_preheader:
2433      EPILOG:
2434        epilog_header_bb:
2435 	 epilog_body
2436 	 if (exit_epilog_cond) goto merge_bb_3
2437 	 else                  goto epilog_header_bb
2438 
2439        merge_bb_3:
2440 
2441    Note this function peels the prolog and epilog, and adds the guards,
2442    only when necessary.
2443    This function returns the epilogue loop if a decision was made to vectorize
2444    it, otherwise NULL.
2445 
2446    The analysis resulting in this epilogue loop's loop_vec_info was performed
2447    in the same vect_analyze_loop call as the main loop's.  At that time
2448    vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2449    vectorization factors than the main loop.  This list is stored in the main
2450    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide
2451    to vectorize the epilogue loop for a lower vectorization factor, the
2452    loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2453    updated and linked to the epilogue loop.  This is later used to vectorize
2454    the epilogue.  The reason the loop_vec_info needs updating is that it was
2455    constructed based on the original main loop, and the epilogue loop is a
2456    copy of this loop, so all links pointing to statements in the original loop
2457    need updating.  Furthermore, these loop_vec_infos share the
2458    data_reference records, which will also need to be updated.
2459 
2460    TODO: Guard for prefer_scalar_loop should be emitted along with
2461    versioning conditions if loop versioning is needed.  */
2462 
2463 
2464 class loop *
2465 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2466 		 tree *niters_vector, tree *step_vector,
2467 		 tree *niters_vector_mult_vf_var, int th,
2468 		 bool check_profitability, bool niters_no_overflow,
2469 		 tree *advance)
2470 {
2471   edge e, guard_e;
2472   tree type = TREE_TYPE (niters), guard_cond;
2473   basic_block guard_bb, guard_to;
2474   profile_probability prob_prolog, prob_vector, prob_epilog;
2475   int estimated_vf;
2476   int prolog_peeling = 0;
2477   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2478   /* We currently do not support prolog peeling if the target alignment is not
2479      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2480      target alignment being constant.  */
2481   dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2482   if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2483     return NULL;
2484 
2485   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2486     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487 
2488   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2489   poly_uint64 bound_epilog = 0;
2490   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2491       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2492     bound_epilog += vf - 1;
2493   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2494     bound_epilog += 1;
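  /* For example (hypothetical numbers): with VF == 4, peeling for niter
     leaves at most 4 - 1 == 3 scalar iterations for the epilog, plus one
     more when peeling for gaps, giving bound_epilog == 4.  */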
2495   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2496   poly_uint64 bound_scalar = bound_epilog;
2497 
2498   if (!prolog_peeling && !epilog_peeling)
2499     return NULL;
2500 
2501   /* Before doing any peeling make sure to reset debug binds outside of
2502      the loop referring to defs not in LC SSA.  */
2503   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2504   for (unsigned i = 0; i < loop->num_nodes; ++i)
2505     {
2506       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2507       imm_use_iterator ui;
2508       gimple *use_stmt;
2509       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2510 	   gsi_next (&gsi))
2511 	{
2512 	  FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2513 	    if (gimple_debug_bind_p (use_stmt)
2514 		&& loop != gimple_bb (use_stmt)->loop_father
2515 		&& !flow_loop_nested_p (loop,
2516 					gimple_bb (use_stmt)->loop_father))
2517 	      {
2518 		gimple_debug_bind_reset_value (use_stmt);
2519 		update_stmt (use_stmt);
2520 	      }
2521 	}
2522       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523 	   gsi_next (&gsi))
2524 	{
2525 	  ssa_op_iter op_iter;
2526 	  def_operand_p def_p;
2527 	  FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2528 	    FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2529 	      if (gimple_debug_bind_p (use_stmt)
2530 		  && loop != gimple_bb (use_stmt)->loop_father
2531 		  && !flow_loop_nested_p (loop,
2532 					  gimple_bb (use_stmt)->loop_father))
2533 		{
2534 		  gimple_debug_bind_reset_value (use_stmt);
2535 		  update_stmt (use_stmt);
2536 		}
2537 	}
2538     }
2539 
2540   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2541   estimated_vf = vect_vf_for_cost (loop_vinfo);
2542   if (estimated_vf == 2)
2543     estimated_vf = 3;
2544   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2545 			.apply_scale (estimated_vf - 1, estimated_vf);
2546 
2547   class loop *prolog, *epilog = NULL;
2548   class loop *first_loop = loop;
2549   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2550 
2551   /* We might have a queued need to update virtual SSA form.  As we
2552      delete the update SSA machinery below after doing a regular
2553      incremental SSA update during loop copying, make sure we don't
2554      lose that fact.
2555      ???  Needing to update virtual SSA form by renaming is unfortunate
2556      but not all of the vectorizer code inserting new loads / stores
2557      properly assigns virtual operands to those statements.  */
2558   update_ssa (TODO_update_ssa_only_virtuals);
2559 
2560   create_lcssa_for_virtual_phi (loop);
2561 
2562   /* If we're vectorizing an epilogue loop, the update_ssa above will
2563      have ensured that the virtual operand is in SSA form throughout the
2564      vectorized main loop.  Normally it is possible to trace the updated
2565      vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2566      back to scalar-stmt vuses, meaning that the effect of the SSA update
2567      remains local to the main loop.  However, there are rare cases in
2568      which the vectorized loop has vdefs even when the original scalar
2569      loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2570      introduces clobbers of the temporary vector array, which in turn
2571      needs new vdefs.  If the scalar loop doesn't write to memory, these
2572      new vdefs will be the only ones in the vector loop.
2573 
2574      In that case, update_ssa will have added a new virtual phi to the
2575      main loop, which previously didn't need one.  Ensure that we (locally)
2576      maintain LCSSA form for the virtual operand, just as we would have
2577      done if the virtual phi had existed from the outset.  This makes it
2578      easier to duplicate the scalar epilogue loop below.  */
2579   tree vop_to_rename = NULL_TREE;
2580   if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2581     {
2582       class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2583       vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2584     }
2585 
2586   if (MAY_HAVE_DEBUG_BIND_STMTS)
2587     {
2588       gcc_assert (!adjust_vec.exists ());
2589       adjust_vec.create (32);
2590     }
2591   initialize_original_copy_tables ();
2592 
2593   /* Record the anchor bb at which the guard should be placed if the scalar
2594      loop might be preferred.  */
2595   basic_block anchor = loop_preheader_edge (loop)->src;
2596 
2597   /* Generate the number of iterations for the prolog loop.  We do this here
2598      so that we can also get the upper bound on the number of iterations.  */
2599   tree niters_prolog;
2600   int bound_prolog = 0;
2601   if (prolog_peeling)
2602     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2603 						  &bound_prolog);
2604   else
2605     niters_prolog = build_int_cst (type, 0);
2606 
2607   loop_vec_info epilogue_vinfo = NULL;
2608   if (vect_epilogues)
2609     {
2610       epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2611       loop_vinfo->epilogue_vinfos.ordered_remove (0);
2612     }
2613 
2614   tree niters_vector_mult_vf = NULL_TREE;
2615   /* Save NITERS before peeling, as it may be changed by the prologue.  */
2616   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2617   edge update_e = NULL, skip_e = NULL;
2618   unsigned int lowest_vf = constant_lower_bound (vf);
2619   /* If we know the number of scalar iterations for the main loop we should
2620      check whether after the main loop there are enough iterations left over
2621      for the epilogue.  */
2622   if (vect_epilogues
2623       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624       && prolog_peeling >= 0
2625       && known_eq (vf, lowest_vf))
2626     {
2627       unsigned HOST_WIDE_INT eiters
2628 	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
2629 	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2630 
2631       eiters -= prolog_peeling;
2632       eiters
2633 	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
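      /* For example (hypothetical numbers): 102 scalar iterations with one
	 iteration peeled for gaps and 3 for the prolog leave 98 for the
	 main loop; with lowest_vf == 8 that is 98 % 8 + 1 == 3 iterations
	 left for the epilogue.  */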
2634 
2635       unsigned int ratio;
2636       unsigned int epilogue_gaps
2637 	= LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2638       while (!(constant_multiple_p
2639 	       (GET_MODE_SIZE (loop_vinfo->vector_mode),
2640 		GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2641 	       && eiters >= lowest_vf / ratio + epilogue_gaps))
2642 	{
2643 	  delete epilogue_vinfo;
2644 	  epilogue_vinfo = NULL;
2645 	  if (loop_vinfo->epilogue_vinfos.length () == 0)
2646 	    {
2647 	      vect_epilogues = false;
2648 	      break;
2649 	    }
2650 	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2651 	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
2652 	  epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2653 	}
2654     }
2655   /* Prolog loop may be skipped.  */
2656   bool skip_prolog = (prolog_peeling != 0);
2657   /* Skip straight to the epilog when there are not enough iterations to
2658      enter this vectorized loop.  If that may be the case, we have to add
2659      a runtime check on NITERS to decide whether the vectorized loop
2660      should be entered at all.  If we know the number of scalar
2661      iterations, we add the check only when that number may be smaller
2662      than the number of iterations required to enter this loop; for this
2663      we use the upper bounds on the prolog and epilog peeling.  When we
2664      don't know the number of iterations and don't require versioning,
2665      the skip is not necessary, because we have asserted that there are
2666      enough scalar iterations to enter the main loop.  When we are
2667      versioning, we only add such a skip if we have chosen to vectorize
2668      the epilogue.  */
2669   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2670 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2671 				  bound_prolog + bound_epilog)
2672 		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2673 			 || vect_epilogues));
2674   /* Epilog loop must be executed if the number of iterations for epilog
2675      loop is known at compile time, otherwise we need to add a check at
2676      the end of vector loop and skip to the end of epilog loop.  */
2677   bool skip_epilog = (prolog_peeling < 0
2678 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2679 		      || !vf.is_constant ());
2680   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2681   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2682     skip_epilog = false;
2683 
2684   if (skip_vector)
2685     {
2686       split_edge (loop_preheader_edge (loop));
2687 
2688       /* Due to the order in which we peel prolog and epilog, we first
2689 	 propagate probability to the whole loop.  The purpose is to
2690 	 avoid adjusting probabilities of both prolog and vector loops
2691 	 separately.  Note in this case, the probability of epilog loop
2692 	 needs to be scaled back later.  */
2693       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2694       if (prob_vector.initialized_p ())
2695 	{
2696 	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2697 	  scale_loop_profile (loop, prob_vector, 0);
2698 	}
2699     }
2700 
2701   dump_user_location_t loop_loc = find_loop_location (loop);
2702   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2703   if (vect_epilogues)
2704     /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2705        use the original scalar loop as remaining epilogue if necessary.  */
2706     LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2707       = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2708 
2709   if (prolog_peeling)
2710     {
2711       e = loop_preheader_edge (loop);
2712       if (!slpeel_can_duplicate_loop_p (loop, e))
2713 	{
2714 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2715 			   "loop can't be duplicated to preheader edge.\n");
2716 	  gcc_unreachable ();
2717 	}
2718       /* Peel prolog and put it on preheader edge of loop.  */
2719       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2720       if (!prolog)
2721 	{
2722 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2723 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2724 	  gcc_unreachable ();
2725 	}
2726       prolog->force_vectorize = false;
2727       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2728       first_loop = prolog;
2729       reset_original_copy_tables ();
2730 
2731       /* Update the number of iterations for prolog loop.  */
2732       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2733       vect_set_loop_condition (prolog, NULL, niters_prolog,
2734 			       step_prolog, NULL_TREE, false);
2735 
2736       /* Skip the prolog loop.  */
2737       if (skip_prolog)
2738 	{
2739 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2740 				    niters_prolog, build_int_cst (type, 0));
2741 	  guard_bb = loop_preheader_edge (prolog)->src;
2742 	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2743 	  guard_to = split_edge (loop_preheader_edge (loop));
2744 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2745 					   guard_to, guard_bb,
2746 					   prob_prolog.invert (),
2747 					   irred_flag);
2748 	  e = EDGE_PRED (guard_to, 0);
2749 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2750 	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2751 
2752 	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2753 	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2754 	}
2755 
2756       /* Update init address of DRs.  */
2757       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2758       /* Update niters for vector loop.  */
2759       LOOP_VINFO_NITERS (loop_vinfo)
2760 	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2761       LOOP_VINFO_NITERSM1 (loop_vinfo)
2762 	= fold_build2 (MINUS_EXPR, type,
2763 		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2764       bool new_var_p = false;
2765       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2766       /* It's guaranteed that vector loop bound before vectorization is at
2767 	 least VF, so set range information for newly generated var.  */
2768       if (new_var_p)
2769 	set_range_info (niters, VR_RANGE,
2770 			wi::to_wide (build_int_cst (type, vf)),
2771 			wi::to_wide (TYPE_MAX_VALUE (type)));
2772 
2773       /* Prolog iterates at most bound_prolog times, latch iterates at
2774 	 most bound_prolog - 1 times.  */
2775       record_niter_bound (prolog, bound_prolog - 1, false, true);
2776       delete_update_ssa ();
2777       adjust_vec_debug_stmts ();
2778       scev_reset ();
2779     }
2780 
2781   if (epilog_peeling)
2782     {
2783       e = single_exit (loop);
2784       if (!slpeel_can_duplicate_loop_p (loop, e))
2785 	{
2786 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2787 			   "loop can't be duplicated to exit edge.\n");
2788 	  gcc_unreachable ();
2789 	}
2790       /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2791 	 said epilog then we should use a copy of the main loop as a starting
2792 	 point.  This loop may have already had some preliminary transformations
2793 	 to allow for more optimal vectorization, for example if-conversion.
2794 	 If we are not vectorizing the epilog then we should use the scalar loop
2795 	 as the transformations mentioned above make less or no sense when not
2796 	 vectorizing.  */
2797       epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2798       if (vop_to_rename)
2799 	{
2800 	  /* Vectorizing the main loop can sometimes introduce a vdef to
2801 	     a loop that previously didn't have one; see the comment above
2802 	     the definition of VOP_TO_RENAME for details.  The definition
2803 	     D that holds on E will then be different from the definition
2804 	     VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2805 	     rename VOP_TO_RENAME to D when copying the loop.
2806 
2807 	     The virtual operand is in LCSSA form for the main loop,
2808 	     and no stmt between the main loop and E needs a vdef,
2809 	     so we know that D is provided by a phi rather than by a
2810 	     vdef on a normal gimple stmt.  */
2811 	  basic_block vdef_bb = e->src;
2812 	  gphi *vphi;
2813 	  while (!(vphi = get_virtual_phi (vdef_bb)))
2814 	    vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2815 	  gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2816 	  set_current_def (vop_to_rename, gimple_phi_result (vphi));
2817 	}
2818       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2819       if (!epilog)
2820 	{
2821 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2822 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2823 	  gcc_unreachable ();
2824 	}
2825       epilog->force_vectorize = false;
2826       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2827 
2828       /* The scalar version of the loop may be preferred.  In this case,
2829 	 add a guard and skip to the epilog.  Note this only happens when
2830 	 the number of iterations of the loop is unknown at compile time;
2831 	 otherwise the loop would not be vectorized at all.  */
2832       if (skip_vector)
2833 	{
2834 	  /* An additional epilogue iteration is peeled if a gap exists.  */
2835 	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2836 						bound_prolog, bound_epilog,
2837 						th, &bound_scalar,
2838 						check_profitability);
2839 	  /* Guard on NITERSM1 since NITERS (= NITERSM1 + 1) may overflow.  */
2840 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2841 	  guard_bb = anchor;
2842 	  guard_to = split_edge (loop_preheader_edge (epilog));
2843 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2844 					   guard_to, guard_bb,
2845 					   prob_vector.invert (),
2846 					   irred_flag);
2847 	  skip_e = guard_e;
2848 	  e = EDGE_PRED (guard_to, 0);
2849 	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2850 	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2851 
2852 	  /* Simply propagate profile info from guard_bb to guard_to, which is
2853 	     a merge point of control flow.  */
2854 	  guard_to->count = guard_bb->count;
2855 
2856 	  /* Scale probability of epilog loop back.
2857 	     FIXME: We should avoid scaling down and back up.  Profile may
2858 	     get lost if we scale down to 0.  */
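	  /* apply_scale (num, den) multiplies a count by num / den, so the
	     expression below computes count * count / (count * prob_vector),
	     i.e. count / prob_vector, undoing the earlier scaling by
	     prob_vector.  */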
2859 	  basic_block *bbs = get_loop_body (epilog);
2860 	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2861 	    bbs[i]->count = bbs[i]->count.apply_scale
2862 				 (bbs[i]->count,
2863 				  bbs[i]->count.apply_probability
2864 				    (prob_vector));
2865 	  free (bbs);
2866 	}
2867 
2868       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2869       /* If the loop is peeled a non-zero constant number of times, NITERS
2870 	 now refers to orig_niters - prolog_peeling; it won't overflow even
2871 	 if orig_niters does.  */
2872       niters_no_overflow |= (prolog_peeling > 0);
2873       vect_gen_vector_loop_niters (loop_vinfo, niters,
2874 				   niters_vector, step_vector,
2875 				   niters_no_overflow);
2876       if (!integer_onep (*step_vector))
2877 	{
2878 	  /* On exit from the loop we will have an easy way of calculating
2879 	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2880 	     until then.  */
2881 	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2882 	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2883 	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2884 	}
2885       else
2886 	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2887 					     &niters_vector_mult_vf);
2888       /* Update IVs of original loop as if they were advanced by
2889 	 niters_vector_mult_vf steps.  */
2890       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2891       update_e = skip_vector ? e : loop_preheader_edge (epilog);
2892       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2893 					update_e);
2894 
2895       if (skip_epilog)
2896 	{
2897 	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2898 				    niters, niters_vector_mult_vf);
2899 	  guard_bb = single_exit (loop)->dest;
2900 	  guard_to = split_edge (single_exit (epilog));
2901 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2902 					   skip_vector ? anchor : guard_bb,
2903 					   prob_epilog.invert (),
2904 					   irred_flag);
2905 	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2906 					      single_exit (epilog));
2907 	  /* We only need to handle the basic block before the epilog loop if
2908 	     it's not guard_bb, which is the case when skip_vector is true.  */
2909 	  if (guard_bb != bb_before_epilog)
2910 	    {
2911 	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2912 
2913 	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2914 	    }
2915 	  scale_loop_profile (epilog, prob_epilog, 0);
2916 	}
2917       else
2918 	slpeel_update_phi_nodes_for_lcssa (epilog);
2919 
2920       unsigned HOST_WIDE_INT bound;
2921       if (bound_scalar.is_constant (&bound))
2922 	{
2923 	  gcc_assert (bound != 0);
2924 	  /* -1 to convert loop iterations to latch iterations.  */
2925 	  record_niter_bound (epilog, bound - 1, false, true);
2926 	}
2927 
2928       delete_update_ssa ();
2929       adjust_vec_debug_stmts ();
2930       scev_reset ();
2931     }
2932 
2933   if (vect_epilogues)
2934     {
2935       epilog->aux = epilogue_vinfo;
2936       LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2937 
2938       loop_constraint_clear (epilog, LOOP_C_INFINITE);
2939 
2940       /* Calculate NITERS, the iterations already performed by the previous
2941 	 loop and its prologue, and EPILOGUE_NITERS, those left to the epilogue.  */
2942       tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2943 				 niters_prolog, niters_vector_mult_vf);
2944 
2945       /* If SKIP_VECTOR is set we may skip the previous loop; insert a
2946 	 phi-node to determine whether we are coming from the previous
2947 	 vectorized loop via the update_e edge or from the skip block via
2948 	 the skip_e edge.  */
2949       if (skip_vector)
2950 	{
2951 	  gcc_assert (update_e != NULL && skip_e != NULL);
2952 	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2953 					   update_e->dest);
2954 	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2955 	  gimple *stmt = gimple_build_assign (new_ssa, niters);
2956 	  gimple_stmt_iterator gsi;
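	  /* Insert the computation of NITERS right after the definition of
	     NITERS_VECTOR_MULT_VF when that definition is present in the IL;
	     otherwise fall back to the end of the block feeding UPDATE_E.  */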
2957 	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2958 	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2959 	    {
2960 	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2961 	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2962 	    }
2963 	  else
2964 	    {
2965 	      gsi = gsi_last_bb (update_e->src);
2966 	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2967 	    }
2968 
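	  /* The phi merges the freshly computed NITERS on the vector path
	     with zero on the skip path, where no iterations have been
	     performed yet.  */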
2969 	  niters = new_ssa;
2970 	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2971 	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2972 		       UNKNOWN_LOCATION);
2973 	  niters = PHI_RESULT (new_phi);
2974 	}
2975 
2976       /* Subtract the number of iterations performed by the vectorized loop
2977 	 from the number of total iterations.  */
2978       tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2979 					  before_loop_niters,
2980 					  niters);
2981 
2982       LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2983       LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2984 	= fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2985 		       epilogue_niters,
2986 		       build_one_cst (TREE_TYPE (epilogue_niters)));
2987 
2988       /* Set ADVANCE to the number of iterations performed by the previous
2989 	 loop and its prologue.  */
2990       *advance = niters;
2991 
2992       /* Redo the peeling-for-niter analysis as NITERS and the alignment
2993 	 may have been updated to take the main loop into account.  */
2994       determine_peel_for_niter (epilogue_vinfo);
2995     }
2996 
2997   adjust_vec.release ();
2998   free_original_copy_tables ();
2999 
3000   return vect_epilogues ? epilog : NULL;
3001 }
3002 
3003 /* Function vect_create_cond_for_niters_checks.
3004 
3005    Create a conditional expression that represents the run-time checks for
3006    the loop's niter.  The loop is guaranteed to terminate if the run-time
3007    checks hold.
3008 
3009    Input:
3010    COND_EXPR  - input conditional expression.  New conditions will be chained
3011 		with logical AND operation; a null *COND_EXPR is treated
3012 		as "true".
3013    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3014 		under which the niter information is valid.
3015 
3016    Output:
3017    COND_EXPR - conditional expression.
3018 
3019    The returned COND_EXPR is the conditional expression to be used in the
3020    if statement that controls which version of the loop gets executed at
3021    runtime.  */
3022 
3023 static void
3024 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3025 {
3026   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3027 
3028   if (*cond_expr)
3029     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3030 			      *cond_expr, part_cond_expr);
3031   else
3032     *cond_expr = part_cond_expr;
3033 }
3034 
3035 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3036    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3037 
3038 static void
3039 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3040 {
3041   if (*cond_expr)
3042     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3043 			      *cond_expr, part_cond_expr);
3044   else
3045     *cond_expr = part_cond_expr;
3046 }
3047 
3048 /* Function vect_create_cond_for_align_checks.
3049 
3050    Create a conditional expression that represents the alignment checks for
3051    all the data references (array element references) whose alignment must be
3052    checked at runtime.
3053 
3054    Input:
3055    COND_EXPR  - input conditional expression.  New conditions will be chained
3056                 with logical AND operation.
3057    LOOP_VINFO - two fields of the loop information are used.
3058                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3059                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3060 
3061    Output:
3062    COND_EXPR_STMT_LIST - statements needed to construct the conditional
3063                          expression.
3064    The returned value is the conditional expression to be used in the if
3065    statement that controls which version of the loop gets executed at runtime.
3066 
3067    The algorithm makes two assumptions:
3068      1) The number of bytes "n" in a vector is a power of 2.
3069      2) An address "a" is aligned if a%n is zero, so the test can be
3070         done as a&(n-1) == 0.  For example, for 16-byte vectors the
3071         test is a&0xf == 0.  */
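
/* For instance, with two data references and 16-byte vectors, the statements
   built below are roughly:

     addr2int0 = (intptr) address_of_first_vector_of_dr_1;
     addr2int1 = (intptr) address_of_first_vector_of_dr_2;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;

   and the condition chained into *COND_EXPR is andmask == 0.  */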
3072 
3073 static void
3074 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3075                                    tree *cond_expr,
3076 				   gimple_seq *cond_expr_stmt_list)
3077 {
3078   vec<stmt_vec_info> may_misalign_stmts
3079     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3080   stmt_vec_info stmt_info;
3081   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3082   tree mask_cst;
3083   unsigned int i;
3084   tree int_ptrsize_type;
3085   char tmp_name[20];
3086   tree or_tmp_name = NULL_TREE;
3087   tree and_tmp_name;
3088   gimple *and_stmt;
3089   tree ptrsize_zero;
3090   tree part_cond_expr;
3091 
3092   /* Check that mask is one less than a power of 2, i.e., mask is
3093      all zeros followed by all ones.  */
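  /* E.g. for 16-byte vectors MASK is 0xf, for which 0xf & 0x10 == 0.  */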
3094   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3095 
3096   int_ptrsize_type = signed_type_for (ptr_type_node);
3097 
3098   /* Create the expression (mask & (dr_1 | ... | dr_n)), where dr_i is the
3099      address of the first vector of the i'th data reference.  */
3100 
3101   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3102     {
3103       gimple_seq new_stmt_list = NULL;
3104       tree addr_base;
3105       tree addr_tmp_name;
3106       tree new_or_tmp_name;
3107       gimple *addr_stmt, *or_stmt;
3108       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3109       bool negative = tree_int_cst_compare
3110 	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3111       tree offset = negative
3112 	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
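      /* For a negative step the first vector access ends at the DR and
	 starts below it, so check the alignment of that lower address
	 rather than the DR itself.  */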
3113 
3114       /* create: addr_tmp = (int)(address_of_first_vector) */
3115       addr_base =
3116 	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3117 					      offset);
3118       if (new_stmt_list != NULL)
3119 	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3120 
3121       sprintf (tmp_name, "addr2int%d", i);
3122       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3123       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3124       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3125 
3126       /* The addresses are ORed together.  */
3127 
3128       if (or_tmp_name != NULL_TREE)
3129         {
3130           /* create: or_tmp = or_tmp | addr_tmp */
3131           sprintf (tmp_name, "orptrs%d", i);
3132 	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3133 	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3134 					 or_tmp_name, addr_tmp_name);
3135 	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3136           or_tmp_name = new_or_tmp_name;
3137         }
3138       else
3139         or_tmp_name = addr_tmp_name;
3140 
3141     } /* end for i */
3142 
3143   mask_cst = build_int_cst (int_ptrsize_type, mask);
3144 
3145   /* create: and_tmp = or_tmp & mask  */
3146   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3147 
3148   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3149 				  or_tmp_name, mask_cst);
3150   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3151 
3152   /* Make and_tmp the left operand of the conditional test against zero.
3153      If and_tmp has a nonzero bit then some address is unaligned.  */
3154   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3155   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3156 				and_tmp_name, ptrsize_zero);
3157   chain_cond_expr (cond_expr, part_cond_expr);
3158 }
3159 
3160 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3161    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3162    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3163    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3164 
3165 static void
3166 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3167 {
3168   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3169   unsigned int i;
3170   vec_object_pair *pair;
3171   FOR_EACH_VEC_ELT (pairs, i, pair)
3172     {
3173       tree addr1 = build_fold_addr_expr (pair->first);
3174       tree addr2 = build_fold_addr_expr (pair->second);
3175       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3176 					 addr1, addr2);
3177       chain_cond_expr (cond_expr, part_cond_expr);
3178     }
3179 }
3180 
3181 /* Create an expression that is true when all lower-bound conditions for
3182    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3183 
3184 static void
3185 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3186 {
3187   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3188   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3189     {
3190       tree expr = lower_bounds[i].expr;
3191       tree type = unsigned_type_for (TREE_TYPE (expr));
3192       expr = fold_convert (type, expr);
3193       poly_uint64 bound = lower_bounds[i].min_value;
3194       if (!lower_bounds[i].unsigned_p)
3195 	{
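	  /* For signed EXPR the requirement is abs (EXPR) >= BOUND.  Adding
	     BOUND - 1 and testing >= 2 * BOUND - 1 in the unsigned type
	     checks both signs at once: e.g. with BOUND 16, EXPR -16 wraps
	     to a huge unsigned value (passes) while EXPR 15 becomes
	     30 < 31 (fails).  */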
3196 	  expr = fold_build2 (PLUS_EXPR, type, expr,
3197 			      build_int_cstu (type, bound - 1));
3198 	  bound += bound - 1;
3199 	}
3200       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3201 					 build_int_cstu (type, bound));
3202       chain_cond_expr (cond_expr, part_cond_expr);
3203     }
3204 }
3205 
3206 /* Function vect_create_cond_for_alias_checks.
3207 
3208    Create a conditional expression that represents the run-time checks for
3209    overlapping of address ranges represented by a list of data references
3210    relations passed as input.
3211 
3212    Input:
3213    COND_EXPR  - input conditional expression.  New conditions will be chained
3214                 with logical AND operation; a null COND_EXPR is
3215                 treated as "true".
3216    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3217 	        to be checked.
3218 
3219    Output:
3220    COND_EXPR - conditional expression.
3221 
3222    The returned COND_EXPR is the conditional expression to be used in the if
3223    statement that controls which version of the loop gets executed at runtime.
3224 */
3225 
3226 void
3227 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3228 {
3229   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3230     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3231 
3232   if (comp_alias_ddrs.is_empty ())
3233     return;
3234 
3235   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3236 			       &comp_alias_ddrs, cond_expr);
3237   if (dump_enabled_p ())
3238     dump_printf_loc (MSG_NOTE, vect_location,
3239 		     "created %u versioning for alias checks.\n",
3240 		     comp_alias_ddrs.length ());
3241 }
3242 
3243 
3244 /* Function vect_loop_versioning.
3245 
3246    If the loop has data references that may or may not be aligned and/or
3247    has data reference relations whose independence was not proven, then
3248    two versions of the loop need to be generated, one which is vectorized
3249    and one which isn't.  A test is then generated to control which of the
3250    loops is executed.  The test checks the alignment of all of the
3251    data references that may or may not be aligned.  An additional
3252    sequence of runtime tests is generated for each pair of DDRs whose
3253    independence was not proven.  The vectorized version of the loop is
3254    executed only if both the alias and alignment tests pass.
3255 
3256    The test generated to check which version of loop is executed
3257    is modified to also check for profitability as indicated by the
3258    cost model threshold TH.
3259 
3260    The versioning precondition(s) are placed in *COND_EXPR and
3261    *COND_EXPR_STMT_LIST.  */
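
/* Roughly, the versioning test built below has the shape:

     if (niters >= threshold && all_addresses_aligned && !ranges_may_alias
	 && ...)
       vectorized loop;
     else
       scalar loop;

   where each clause is present only when the corresponding versioning
   requirement applies.  */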
3262 
3263 class loop *
3264 vect_loop_versioning (loop_vec_info loop_vinfo,
3265 		      gimple *loop_vectorized_call)
3266 {
3267   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3268   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3269   basic_block condition_bb;
3270   gphi_iterator gsi;
3271   gimple_stmt_iterator cond_exp_gsi;
3272   basic_block merge_bb;
3273   basic_block new_exit_bb;
3274   edge new_exit_e, e;
3275   gphi *orig_phi, *new_phi;
3276   tree cond_expr = NULL_TREE;
3277   gimple_seq cond_expr_stmt_list = NULL;
3278   tree arg;
3279   profile_probability prob = profile_probability::likely ();
3280   gimple_seq gimplify_stmt_list = NULL;
3281   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3282   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3283   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3284   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3285   poly_uint64 versioning_threshold
3286     = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3287   tree version_simd_if_cond
3288     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3289   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3290 
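  /* SCALAR_LOOP_ITERS is NITERSM1, the number of latch iterations, so the
     comparisons below against TH - 1 and VERSIONING_THRESHOLD - 1 test
     whether the loop body runs at least that many times.  */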
3291   if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3292       && !ordered_p (th, versioning_threshold))
3293     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3294 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3295 					    th - 1));
3296   if (maybe_ne (versioning_threshold, 0U))
3297     {
3298       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3299 			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3300 					      versioning_threshold - 1));
3301       if (cond_expr)
3302 	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3303 				 expr, cond_expr);
3304       else
3305 	cond_expr = expr;
3306     }
3307 
3308   if (version_niter)
3309     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3310 
3311   if (cond_expr)
3312     cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3313 					&cond_expr_stmt_list,
3314 					is_gimple_condexpr, NULL_TREE);
3315 
3316   if (version_align)
3317     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3318 				       &cond_expr_stmt_list);
3319 
3320   if (version_alias)
3321     {
3322       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3323       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3324       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3325     }
3326 
3327   if (version_simd_if_cond)
3328     {
3329       gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3330       if (flag_checking)
3331 	if (basic_block bb
3332 	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3333 	  gcc_assert (bb != loop->header
3334 		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3335 		      && (scalar_loop == NULL
3336 			  || (bb != scalar_loop->header
3337 			      && dominated_by_p (CDI_DOMINATORS,
3338 						 scalar_loop->header, bb))));
3339       tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3340       tree c = fold_build2 (NE_EXPR, boolean_type_node,
3341 			    version_simd_if_cond, zero);
3342       if (cond_expr)
3343         cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3344 				 c, cond_expr);
3345       else
3346         cond_expr = c;
3347       if (dump_enabled_p ())
3348 	dump_printf_loc (MSG_NOTE, vect_location,
3349 			 "created versioning for simd if condition check.\n");
3350     }
3351 
3352   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3353 				      &gimplify_stmt_list,
3354 				      is_gimple_condexpr, NULL_TREE);
3355   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3356 
3357   /* Compute the outermost loop in which cond_expr and cond_expr_stmt_list
3358      are invariant.  */
3359   class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3360   for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3361        !gsi_end_p (gsi); gsi_next (&gsi))
3362     {
3363       gimple *stmt = gsi_stmt (gsi);
3364       update_stmt (stmt);
3365       ssa_op_iter iter;
3366       use_operand_p use_p;
3367       basic_block def_bb;
3368       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3369 	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3370 	    && flow_bb_inside_loop_p (outermost, def_bb))
3371 	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3372     }
3373 
3374   /* Search for the outermost loop we can version.  Avoid versioning of
3375      non-perfect nests but allow if-conversion versioned loops inside.  */
3376   class loop *loop_to_version = loop;
3377   if (flow_loop_nested_p (outermost, loop))
3378     {
3379       if (dump_enabled_p ())
3380 	dump_printf_loc (MSG_NOTE, vect_location,
3381 			 "trying to apply versioning to outer loop %d\n",
3382 			 outermost->num);
3383       if (outermost->num == 0)
3384 	outermost = superloop_at_depth (loop, 1);
3385       /* And avoid applying versioning on non-perfect nests.  */
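      /* Specifically: walk outwards while the outer loop has a single
	 non-complex exit and LOOP_TO_VERSION is either the only loop in its
	 body or has exactly one sibling, the scalar copy left behind by
	 if-conversion.  */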
3386       while (loop_to_version != outermost
3387 	     && (e = single_exit (loop_outer (loop_to_version)))
3388 	     && !(e->flags & EDGE_COMPLEX)
3389 	     && (!loop_outer (loop_to_version)->inner->next
3390 		 || vect_loop_vectorized_call (loop_to_version))
3391 	     && (!loop_outer (loop_to_version)->inner->next
3392 		 || !loop_outer (loop_to_version)->inner->next->next))
3393 	loop_to_version = loop_outer (loop_to_version);
3394     }
3395 
3396   /* Apply versioning.  If there is already a scalar version created by
3397      if-conversion, re-use it.  Note we cannot re-use the copy of
3398      an if-converted outer loop when vectorizing only the inner loop.  */
3399   gcond *cond;
3400   if ((!loop_to_version->inner || loop == loop_to_version)
3401       && loop_vectorized_call)
3402     {
3403       gcc_assert (scalar_loop);
3404       condition_bb = gimple_bb (loop_vectorized_call);
3405       cond = as_a <gcond *> (last_stmt (condition_bb));
3406       gimple_cond_set_condition_from_tree (cond, cond_expr);
3407       update_stmt (cond);
3408 
3409       if (cond_expr_stmt_list)
3410 	{
3411 	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3412 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3413 				 GSI_SAME_STMT);
3414 	}
3415 
3416       /* if-conversion uses profile_probability::always () for both paths,
3417 	 so reset the paths' probabilities appropriately.  */
3418       edge te, fe;
3419       extract_true_false_edges_from_block (condition_bb, &te, &fe);
3420       te->probability = prob;
3421       fe->probability = prob.invert ();
3422       /* We can scale loop counts immediately but have to postpone
3423          scaling the scalar loop because we re-use it during peeling.  */
3424       scale_loop_frequencies (loop_to_version, te->probability);
3425       LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3426 
3427       nloop = scalar_loop;
3428       if (dump_enabled_p ())
3429 	dump_printf_loc (MSG_NOTE, vect_location,
3430 			 "reusing %sloop version created by if conversion\n",
3431 			 loop_to_version != loop ? "outer " : "");
3432     }
3433   else
3434     {
3435       if (loop_to_version != loop
3436 	  && dump_enabled_p ())
3437 	dump_printf_loc (MSG_NOTE, vect_location,
3438 			 "applying loop versioning to outer loop %d\n",
3439 			 loop_to_version->num);
3440 
3441       initialize_original_copy_tables ();
3442       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3443 			    prob, prob.invert (), prob, prob.invert (), true);
3444       gcc_assert (nloop);
3445       nloop = get_loop_copy (loop);
3446 
3447       /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3448          reap them otherwise, and they also refer to the original
3449 	 loops.  */
3450       class loop *l = loop;
3451       while (gimple *call = vect_loop_vectorized_call (l))
3452 	{
3453 	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3454 	  fold_loop_internal_call (call, boolean_false_node);
3455 	  l = loop_outer (l);
3456 	}
3457       free_original_copy_tables ();
3458 
3459       if (cond_expr_stmt_list)
3460 	{
3461 	  cond_exp_gsi = gsi_last_bb (condition_bb);
3462 	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3463 				 GSI_SAME_STMT);
3464 	}
3465 
3466       /* Loop versioning violates an assumption we try to maintain during
3467 	 vectorization - that the loop exit block has a single predecessor.
3468 	 After versioning, the exit block of both loop versions is the same
3469 	 basic block (i.e. it has two predecessors). Just in order to simplify
3470 	 following transformations in the vectorizer, we fix this situation
3471 	 here by adding a new (empty) block on the exit-edge of the loop,
3472 	 with the proper loop-exit phis to maintain loop-closed-form.
3473 	 If loop versioning wasn't done from loop, but from scalar_loop
3474 	 instead, merge_bb will already have just a single predecessor.  */
3475 
3476       merge_bb = single_exit (loop_to_version)->dest;
3477       if (EDGE_COUNT (merge_bb->preds) >= 2)
3478 	{
3479 	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3480 	  new_exit_bb = split_edge (single_exit (loop_to_version));
3481 	  new_exit_e = single_exit (loop_to_version);
3482 	  e = EDGE_SUCC (new_exit_bb, 0);
3483 
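	  /* For each loop-closed phi in MERGE_BB, create a matching phi in
	     NEW_EXIT_BB that receives the value from the loop-exit edge, and
	     make the MERGE_BB phi take the new phi's result instead, keeping
	     the versioned loop in loop-closed SSA form.  */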
3484 	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3485 	       gsi_next (&gsi))
3486 	    {
3487 	      tree new_res;
3488 	      orig_phi = gsi.phi ();
3489 	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3490 	      new_phi = create_phi_node (new_res, new_exit_bb);
3491 	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3492 	      add_phi_arg (new_phi, arg, new_exit_e,
3493 			   gimple_phi_arg_location_from_edge (orig_phi, e));
3494 	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3495 	    }
3496 	}
3497 
3498       update_ssa (TODO_update_ssa);
3499     }
3500 
3501   if (version_niter)
3502     {
3503       /* The versioned loop could be infinite; we need to clear the existing
3504 	 niter information, which was copied from the original loop.  */
3505       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3506       vect_free_loop_info_assumptions (nloop);
3507       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3508       loop_constraint_set (loop, LOOP_C_INFINITE);
3509     }
3510 
3511   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3512       && dump_enabled_p ())
3513     {
3514       if (version_alias)
3515         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3516 			 vect_location,
3517                          "loop versioned for vectorization because of "
3518 			 "possible aliasing\n");
3519       if (version_align)
3520         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3521 			 vect_location,
3522                          "loop versioned for vectorization to enhance "
3523 			 "alignment\n");
3524 
3525     }
3526 
3527   return nloop;
3528 }
3529