1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
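/* For example, the statement this creates in LOOP's header has the form
   (with hypothetical SSA names):

     mask_1 = PHI <init_mask_2 (preheader), next_mask_3 (latch)>  */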
262
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
266 {
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
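/* As a concrete instance of the encoding built below, with 4-element
   vectors OFFSET == 0 describes the permutation { 0, 4, 1, 5 } and
   OFFSET == 1 describes { 2, 6, 3, 7 }.  */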
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
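/* For instance, if SRC_RGM contains a single mask S0 and DEST_RGM contains
   D0 and D1 (hypothetical names), the permute path below emits

     D0 = VEC_PERM_EXPR <S0, S0, low-interleave-sel>;
     D1 = VEC_PERM_EXPR <S0, S0, high-interleave-sel>;

   while the unpack path emits VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR
   statements instead, possibly followed by VIEW_CONVERT_EXPRs.  */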
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
325 {
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
342 {
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
375 {
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGM->max_nscalars_per_iter
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGM->max_nscalars_per_iter
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
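/* As a worked instance of the wrap condition, with made-up numbers:
   with a 16-bit comparison type, VF = 4 and RGM->max_nscalars_per_iter = 2,
   the IV steps by 8 per vector iteration.  If (NITERS + NITERS_SKIP) * 2
   were 65534, the IV would reach 65528 and then wrap to 0 without ever
   exceeding that limit, so it could never produce an all-false mask and
   MIGHT_WRAP_P would be true.  */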
415
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
422 {
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
429
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
438 {
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
450 }
451
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
460
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
465 {
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
468
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
470
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
478
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
482
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
485
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
487
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
489
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
503
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
506 {
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
517 }
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
523 }
524 else
525 {
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
534
535 first_limit = test_limit;
536 }
537
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
543
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
549 {
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
555
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
562
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
566 tree this_test_limit = test_limit;
567 if (i != 0)
568 {
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
575 }
576
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
581 {
582 tree start, end;
583 if (first_limit == test_limit)
584 {
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
591 }
592 else
593 {
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
599 }
600
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
604 }
605
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
612 {
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
620 }
621
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
625
626 /* Get the mask value for the next iteration of the loop. */
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
630
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
632 }
633 return next_mask;
634 }
635
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
641
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
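/* A sketch of the code this generates, for a single rgroup and ignoring
   NITERS_SKIP and the wrap-around handling (hypothetical names):

     preheader:
       init_mask = WHILE_ULT (0, limit);
     loop:
       mask = PHI <init_mask (preheader), next_mask (latch)>;
       ... masked vector statements ...
       iv += nscalars_step;
       next_mask = WHILE_ULT (iv, limit);
       if (next_mask != { 0, ..., 0 }) goto loop;  */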
644
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
650 {
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
653
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
657
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
666 {
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
676 }
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
679
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
691 {
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
697 {
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
702 }
703
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
711
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
718 }
719
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
723
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
740
741 if (final_iv)
742 {
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
745 }
746
747 return cond_stmt;
748 }
749
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
752
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
758 {
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
768
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
772
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
775 {
776 /* In this case we can use a simple 0-based IV:
777
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
789 }
790 else
791 {
792 /* The following works for all values of NITERS except 0:
793
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
802
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
805
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
809
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
812
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
815
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
824
825 In both cases the loop limit is NITERS - STEP. */
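/* For example, NITERS = 8 and STEP = 4 give the limit 4: case B runs
   the body for x = 0 and x = 4 and exits once x reaches 8, i.e. exactly
   NITERS / STEP = 2 iterations.  */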
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
830 {
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
833 }
834 if (niters_maybe_zero)
835 {
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
839 }
840 else
841 {
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
845 }
846 }
847
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
856
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
859
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
861
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
873
874 if (final_iv)
875 {
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
879 }
880
881 return cond_stmt;
882 }
883
884 /* If we're using fully-masked loops, make LOOP iterate:
885
886 N == (NITERS - 1) / STEP + 1
887
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
891
892 If we're not using fully-masked loops, make LOOP iterate:
893
894 N == (NITERS - STEP) / STEP + 1
895
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
903
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
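/* For example, NITERS = 10 and STEP = 4 give N == (10 - 1) / 4 + 1 == 3
   in the fully-masked case and N == (10 - 4) / 4 + 1 == 2 otherwise.  */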
905
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
910 {
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
923
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
935 }
936
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938 For all PHI arguments in FROM->dest and TO->dest coming from those
939 edges, ensure that the current_def of each TO->dest PHI argument is
940 set to the current_def of the corresponding FROM->dest argument. */
941
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944 {
945 gimple_stmt_iterator gsi_from, gsi_to;
946
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
950 {
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
956 {
957 gsi_next (&gsi_from);
958 continue;
959 }
960 if (virtual_operand_p (to_arg))
961 {
962 gsi_next (&gsi_to);
963 continue;
964 }
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
969 {
970 if (get_current_def (to_arg) == NULL_TREE)
971 {
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
976 }
977 }
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
980 }
981
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
987 }
988
989
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
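/* For instance, when E is the exit edge of LOOP, the CFG is transformed
   roughly as follows:

       preheader                 preheader
           |                         |
          LOOP         ==>          LOOP
           |                         |
        exit block             new preheader
                                     |
                                NEW_LOOP (copy)
                                     |
                                 exit block  */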
995
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
999 {
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1007
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1012
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1015
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1024 {
1025 free (bbs);
1026 return NULL;
1027 }
1028
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1032
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1037
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1045
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1052
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1056 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1057 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1058
1059 if (scalar_loop != loop)
1060 {
1061 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1062 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1063 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1064 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1065 header) to have current_def set, so copy them over. */
1066 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1067 exit);
1068 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1069 0),
1070 EDGE_SUCC (loop->latch, 0));
1071 }
1072
1073 if (at_exit) /* Add the loop copy at exit. */
1074 {
1075 if (scalar_loop != loop)
1076 {
1077 gphi_iterator gsi;
1078 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1079
1080 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1081 gsi_next (&gsi))
1082 {
1083 gphi *phi = gsi.phi ();
1084 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1085 location_t orig_locus
1086 = gimple_phi_arg_location_from_edge (phi, e);
1087
1088 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1089 }
1090 }
1091 redirect_edge_and_branch_force (e, new_preheader);
1092 flush_pending_stmts (e);
1093 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1094 if (was_imm_dom || duplicate_outer_loop)
1095 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1096
1097 /* And remove the non-necessary forwarder again. Keep the other
1098 one so we have a proper pre-header for the loop at the exit edge. */
1099 redirect_edge_pred (single_succ_edge (preheader),
1100 single_pred (preheader));
1101 delete_basic_block (preheader);
1102 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1103 loop_preheader_edge (scalar_loop)->src);
1104 }
1105 else /* Add the copy at entry. */
1106 {
1107 if (scalar_loop != loop)
1108 {
1109 /* Remove the non-necessary forwarder of scalar_loop again. */
1110 redirect_edge_pred (single_succ_edge (preheader),
1111 single_pred (preheader));
1112 delete_basic_block (preheader);
1113 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1114 loop_preheader_edge (scalar_loop)->src);
1115 preheader = split_edge (loop_preheader_edge (loop));
1116 entry_e = single_pred_edge (preheader);
1117 }
1118
1119 redirect_edge_and_branch_force (entry_e, new_preheader);
1120 flush_pending_stmts (entry_e);
1121 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1122
1123 redirect_edge_and_branch_force (new_exit, preheader);
1124 flush_pending_stmts (new_exit);
1125 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1126
1127 /* And remove the non-necessary forwarder again. Keep the other
1128 one so we have a proper pre-header for the loop at the exit edge. */
1129 redirect_edge_pred (single_succ_edge (new_preheader),
1130 single_pred (new_preheader));
1131 delete_basic_block (new_preheader);
1132 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1133 loop_preheader_edge (new_loop)->src);
1134 }
1135
1136 if (scalar_loop != loop)
1137 {
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1143
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148 {
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156 }
1157 }
1158
1159 free (new_bbs);
1160 free (bbs);
1161
1162 checking_verify_dominators (CDI_DOMINATORS);
1163
1164 return new_loop;
1165 }
1166
1167
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
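/* In other words, the resulting CFG is:

     GUARD_BB --[COND true, PROBABILITY]--> GUARD_TO
         |
         +------[COND false]--> original successor of GUARD_BB  */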
1173
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1178 {
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1183
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1188
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1193
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1200
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1207
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1211
1212 return new_e;
1213 }
1214
1215
1216 /* This function verifies that the following restrictions apply to LOOP:
1217 (1) it consists of exactly 2 basic blocks (header and an empty latch)
1218 for an innermost loop, or 5 basic blocks for an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1222 */
1223
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1226 {
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1232
1233 /* All loops have an outer scope; the only case loop->outer is NULL is for
1234 the function itself. */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1243
1244 return true;
1245 }
1246
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251 in the same form). Doing this early simplifies the checking what
1252 uses should be renamed.
1253
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
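/* For example (hypothetical SSA names), given the header phi

     .MEM_3 = PHI <.MEM_5 (preheader), .MEM_4 (latch)>

   and no virtual phi in the exit bb, the code below creates

     .MEM_6 = PHI <.MEM_4 (exit)>

   in the exit bb, rewrites uses of .MEM_4 that occur after the loop to
   .MEM_6, and returns .MEM_5.  */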
1256
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1259 {
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1262
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1265 {
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1272 {
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1279
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1289
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1291 }
1292 break;
1293 }
1294 return NULL_TREE;
1295 }
1296
1297 /* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302 Return the loop location on success and an empty location otherwise. */
1303
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1306 {
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
1312 return dump_user_location_t ();
1313
1314 stmt = get_loop_exit_condition (loop);
1315
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1319
1320 /* If we got here, the loop is probably not "well formed";
1321 try to estimate the loop location. */
1322
1323 if (!loop->header)
1324 return dump_user_location_t ();
1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1333 }
1334
1335 return dump_user_location_t ();
1336 }
1337
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1340
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1343 {
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1347
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353 }
1354
1355 /* Function vect_can_advance_ivs_p
1356
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1370
1371 /* Analyze phi functions of the loop header. */
1372
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
1377 tree evolution_part;
1378
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1384
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1387
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1395 }
1396
1397 /* Analyze the evolution function. */
1398
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 which evolution functions are not invariants in the loop. */
1410
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1417 }
1418
1419 /* FORNOW: We do not transform initial conditions of IVs
1420 which evolution functions are a polynomial of degree >= 2. */
1421
1422 if (tree_is_chrec (evolution_part))
1423 {
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1428 }
1429 }
1430
1431 return true;
1432 }
1433
1434
1435 /* Function vect_update_ivs_after_vectorizer.
1436
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1446
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451 vectorized). i.e, the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1459
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1462
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1465
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1468
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473 needs to have its phis updated.
1474 */
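/* For instance (hypothetical names), an IV with preheader value init_7 and
   evolution step 4 gets the exit value

     ni = init_7 + NITERS * 4

   computed in LOOP's exit bb, and that value replaces the corresponding
   phi argument on UPDATE_E.  */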
1475
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1479 {
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1484
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1488
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1492 {
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1498
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1508 {
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1513 }
1514
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1518
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1522
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1533
1534 var = create_tmp_var (type, "tmp");
1535
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547 }
1548 }
1549
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
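/* For example, with 4-byte elements, a 16-byte target alignment and a
   start address ending in 0xc, misalign_in_bytes is 0xc & 0xf == 12 and
   the returned misalign_in_elems is 12 >> 2 == 3.  */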
1554
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557 {
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1565
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572 offset);
1573 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1574 if (target_align.is_constant (&target_align_c))
1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576 else
1577 {
1578 tree vla = build_int_cst (type, target_align);
1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580 fold_build2 (MINUS_EXPR, type,
1581 build_int_cst (type, 0), vla));
1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583 build_int_cst (type, 1));
1584 }
1585
1586 HOST_WIDE_INT elem_size
1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589
1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1591 tree int_start_addr = fold_convert (type, start_addr);
1592 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593 target_align_minus_1);
1594
1595 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1596 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597 elem_size_log);
1598
1599 return misalign_in_elems;
1600 }
1601
1602 /* Function vect_gen_prolog_loop_niters
1603
1604 Generate the number of iterations which should be peeled as prolog for the
1605 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1606 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607 As a result, after the execution of this loop, the data reference DR will
1608 refer to an aligned location. The following computation is generated:
1609
1610    If the misalignment of DR is known at compile time:
1611      addr_mis = DR_MISALIGNMENT (dr)
1612    Else, compute the address misalignment in bytes at run time:
1613      addr_mis = addr & (target_align - 1)
1614
1615 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1616
1617 (elem_size = element type size; an element is the scalar element whose type
1618 is the inner type of the vectype)
1619
1620    The computations will be emitted at the end of BB.  We also compute and
1621    store the upper bound (inclusive) of the result in BOUND.
1622 
1623    When the step of the data-ref in the loop is not 1 (as for interleaved data
1624    and SLP), the number of iterations of the prolog must be divided by the step
1625    (which is equal to the size of the interleaved group).
1626
1627 The above formulas assume that VF == number of elements in the vector. This
1628 may not hold when there are multiple-types in the loop.
1629 In this case, for some data-references in the loop the VF does not represent
1630 the number of elements that fit in the vector. Therefore, instead of VF we
1631 use TYPE_VECTOR_SUBPARTS. */
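
/* Worked example for the formula above (illustrative figures only): with
   4-byte elements and a 16-byte target alignment there are 4 elements per
   aligned block; if the misalignment is 2 elements and the step is 1, the
   prolog must peel

       (4 - 2) & (4 - 1) = 2

   iterations before DR refers to an aligned address.  */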
1632
1633 static tree
1634 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635 basic_block bb, int *bound)
1636 {
1637 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1638 tree var;
1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640 gimple_seq stmts = NULL, new_stmts = NULL;
1641 tree iters, iters_name;
1642 stmt_vec_info stmt_info = dr_info->stmt;
1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1645
1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1647 {
1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE, vect_location,
1652 "known peeling = %d.\n", npeel);
1653
1654 iters = build_int_cst (niters_type, npeel);
1655 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1656 }
1657 else
1658 {
1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660 tree type = TREE_TYPE (misalign_in_elems);
1661 HOST_WIDE_INT elem_size
1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1663 /* We only do prolog peeling if the target alignment is known at compile
1664 time. */
1665 poly_uint64 align_in_elems =
1666 exact_div (target_align, elem_size);
1667 tree align_in_elems_minus_1 =
1668 build_int_cst (type, align_in_elems - 1);
1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670
1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1672 & (align_in_elems - 1)). */
1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674 size_zero_node) < 0;
1675 if (negative)
1676 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677 align_in_elems_tree);
1678 else
1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680 misalign_in_elems);
1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1682 iters = fold_convert (niters_type, iters);
1683 unsigned HOST_WIDE_INT align_in_elems_c;
1684 if (align_in_elems.is_constant (&align_in_elems_c))
1685 *bound = align_in_elems_c - 1;
1686 else
1687 *bound = -1;
1688 }
1689
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_NOTE, vect_location,
1692 "niters for prolog loop: %T\n", iters);
1693
1694 var = create_tmp_var (niters_type, "prolog_loop_niters");
1695 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1696
1697 if (new_stmts)
1698 gimple_seq_add_seq (&stmts, new_stmts);
1699 if (stmts)
1700 {
1701 gcc_assert (single_succ_p (bb));
1702 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703 if (gsi_end_p (gsi))
1704 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705 else
1706 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1707 }
1708 return iters_name;
1709 }
1710
1711
1712 /* Function vect_update_init_of_dr
1713
1714 If CODE is PLUS, the vector loop starts NITERS iterations after the
1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716 iterations before the scalar one (using masking to skip inactive
1717 elements). This function updates the information recorded in DR to
1718 account for the difference. Specifically, it updates the OFFSET
1719 field of DR_INFO. */
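
/* For example (purely illustrative numbers): with CODE == PLUS_EXPR,
   NITERS == 3 and DR_STEP == 8 bytes, the recorded offset grows by
   3 * 8 == 24 bytes, matching a vector loop whose first access starts
   24 bytes after the scalar loop's first access.  */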
1720
1721 static void
1722 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1723 {
1724 struct data_reference *dr = dr_info->dr;
1725 tree offset = dr_info->offset;
1726 if (!offset)
1727 offset = build_zero_cst (sizetype);
1728
1729 niters = fold_build2 (MULT_EXPR, sizetype,
1730 fold_convert (sizetype, niters),
1731 fold_convert (sizetype, DR_STEP (dr)));
1732 offset = fold_build2 (code, sizetype,
1733 fold_convert (sizetype, offset), niters);
1734 dr_info->offset = offset;
1735 }
1736
1737
1738 /* Function vect_update_inits_of_drs
1739
1740    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1741    CODE and NITERS are as for vect_update_init_of_dr.  */
1742
1743 void
1744 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745 tree_code code)
1746 {
1747 unsigned int i;
1748 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1749 struct data_reference *dr;
1750
1751 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1752
1753 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1754 here, but since we might use these niters to update the epilogues niters
1755 and data references we can't insert them here as this definition might not
1756 always dominate its uses. */
1757 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1758 niters = fold_convert (sizetype, niters);
1759
1760 FOR_EACH_VEC_ELT (datarefs, i, dr)
1761 {
1762 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1764 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1765 vect_update_init_of_dr (dr_info, niters, code);
1766 }
1767 }
1768
1769 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770 by masking. This involves calculating the number of iterations to
1771 be peeled and then aligning all memory references appropriately. */
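
/* As a concrete (assumed) example: with a vectorization factor of 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT == 3, the first 8 - 3 == 5 lanes of the
   first masked iteration are inactive, so MASK_SKIP_NITERS is set to 5 and
   every data reference is shifted back by 5 iterations.  */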
1772
1773 void
1774 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775 {
1776 tree misalign_in_elems;
1777 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778
1779 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780
1781 /* From the information recorded in LOOP_VINFO get the number of iterations
1782 that need to be skipped via masking. */
1783 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784 {
1785 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787 misalign_in_elems = build_int_cst (type, misalign);
1788 }
1789 else
1790 {
1791 gimple_seq seq1 = NULL, seq2 = NULL;
1792 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793 misalign_in_elems = fold_convert (type, misalign_in_elems);
1794 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795 &seq2, true, NULL_TREE);
1796 gimple_seq_add_seq (&seq1, seq2);
1797 if (seq1)
1798 {
1799 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801 gcc_assert (!new_bb);
1802 }
1803 }
1804
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "misalignment for fully-masked loop: %T\n",
1808 misalign_in_elems);
1809
1810 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811
1812 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813 }
1814
1815 /* This function builds ni_name = number of iterations. Statements
1816 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1817    it to TRUE if a new ssa_var is generated.  */
1818
1819 tree
1820 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1821 {
1822 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823 if (TREE_CODE (ni) == INTEGER_CST)
1824 return ni;
1825 else
1826 {
1827 tree ni_name, var;
1828 gimple_seq stmts = NULL;
1829 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1830
1831 var = create_tmp_var (TREE_TYPE (ni), "niters");
1832 ni_name = force_gimple_operand (ni, &stmts, false, var);
1833 if (stmts)
1834 {
1835 gsi_insert_seq_on_edge_immediate (pe, stmts);
1836 if (new_var_p != NULL)
1837 *new_var_p = true;
1838 }
1839
1840 return ni_name;
1841 }
1842 }
1843
1844 /* Calculate the number of iterations above which the vectorized loop will be
1845    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1846    of the prolog loop.  If it is an integer constant, that value is also passed
1847    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1848 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1849 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1850 threshold below which the scalar (rather than vectorized) loop will be
1851 executed. This function stores the upper bound (inclusive) of the result
1852 in BOUND_SCALAR. */
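
/* For instance (assumed values): if the prolog iteration count is only
   known at run time, BOUND_PROLOG == 3, BOUND_EPILOG == 4 and the
   profitability threshold TH == 12, then the scalar path runs at most
   TH - 1 == 11 iterations; since 11 >= 3 + 4 the function returns the
   constant 11 and sets *BOUND_SCALAR to 11.  */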
1853
1854 static tree
1855 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856 int bound_prolog, poly_int64 bound_epilog, int th,
1857 poly_uint64 *bound_scalar,
1858 bool check_profitability)
1859 {
1860 tree type = TREE_TYPE (niters_prolog);
1861 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862 build_int_cst (type, bound_epilog));
1863
1864 *bound_scalar = bound_prolog + bound_epilog;
1865 if (check_profitability)
1866 {
1867 /* TH indicates the minimum niters of vectorized loop, while we
1868 compute the maximum niters of scalar loop. */
1869 th--;
1870 /* Peeling for constant times. */
1871 if (int_niters_prolog >= 0)
1872 {
1873 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874 return build_int_cst (type, *bound_scalar);
1875 }
1876 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1877 and BOUND_EPILOG are inclusive upper bounds. */
1878 if (known_ge (th, bound_prolog + bound_epilog))
1879 {
1880 *bound_scalar = th;
1881 return build_int_cst (type, th);
1882 }
1883 /* Need to do runtime comparison. */
1884 else if (maybe_gt (th, bound_epilog))
1885 {
1886 *bound_scalar = upper_bound (*bound_scalar, th);
1887 return fold_build2 (MAX_EXPR, type,
1888 build_int_cst (type, th), niters);
1889 }
1890 }
1891 return niters;
1892 }
1893
1894 /* NITERS is the number of times that the original scalar loop executes
1895 after peeling. Work out the maximum number of iterations N that can
1896 be handled by the vectorized form of the loop and then either:
1897
1898 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1899
1900 niters_vector = N
1901
1902 b) set *STEP_VECTOR_PTR to one and generate:
1903
1904 niters_vector = N / vf
1905
1906 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1908 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
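
/* A short example with assumed numbers: for a constant VF of 4, no peeling
   for gaps and NITERS known not to overflow, case b) generates

       niters_vector = niters >> 2

   whereas if NITERS might have wrapped to zero (latch count equal to
   TYPE_MAX_VALUE) the safe form

       niters_vector = ((niters - 4) >> 2) + 1

   is generated instead, relying on the fact that the vector loop runs at
   least once.  */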
1909
1910 void
1911 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912 tree *niters_vector_ptr, tree *step_vector_ptr,
1913 bool niters_no_overflow)
1914 {
1915 tree ni_minus_gap, var;
1916 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919 tree log_vf = NULL_TREE;
1920
1921 /* If epilogue loop is required because of data accesses with gaps, we
1922 subtract one iteration from the total number of iterations here for
1923 correct calculation of RATIO. */
1924 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1925 {
1926 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927 build_one_cst (type));
1928 if (!is_gimple_val (ni_minus_gap))
1929 {
1930 var = create_tmp_var (type, "ni_gap");
1931 gimple *stmts = NULL;
1932 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933 true, var);
1934 gsi_insert_seq_on_edge_immediate (pe, stmts);
1935 }
1936 }
1937 else
1938 ni_minus_gap = niters;
1939
1940 unsigned HOST_WIDE_INT const_vf;
1941 if (vf.is_constant (&const_vf)
1942 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943 {
1944 /* Create: niters >> log2(vf) */
1945 /* If it's known that niters == number of latch executions + 1 doesn't
1946 overflow, we can generate niters >> log2(vf); otherwise we generate
1947 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948 will be at least one. */
1949 log_vf = build_int_cst (type, exact_log2 (const_vf));
1950 if (niters_no_overflow)
1951 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952 else
1953 niters_vector
1954 = fold_build2 (PLUS_EXPR, type,
1955 fold_build2 (RSHIFT_EXPR, type,
1956 fold_build2 (MINUS_EXPR, type,
1957 ni_minus_gap,
1958 build_int_cst (type, vf)),
1959 log_vf),
1960 build_int_cst (type, 1));
1961 step_vector = build_one_cst (type);
1962 }
1963 else
1964 {
1965 niters_vector = ni_minus_gap;
1966 step_vector = build_int_cst (type, vf);
1967 }
1968
1969 if (!is_gimple_val (niters_vector))
1970 {
1971 var = create_tmp_var (type, "bnd");
1972 gimple_seq stmts = NULL;
1973 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974 gsi_insert_seq_on_edge_immediate (pe, stmts);
1975 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1976 we set range information to make niters analyzer's life easier.
1977 Note the number of latch iteration value can be TYPE_MAX_VALUE so
1978 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
1979 if (stmts != NULL && log_vf)
1980 {
1981 if (niters_no_overflow)
1982 set_range_info (niters_vector, VR_RANGE,
1983 wi::one (TYPE_PRECISION (type)),
1984 wi::rshift (wi::max_value (TYPE_PRECISION (type),
1985 TYPE_SIGN (type)),
1986 exact_log2 (const_vf),
1987 TYPE_SIGN (type)));
1988 /* For VF == 1 the vector IV might also overflow so we cannot
1989 assert a minimum value of 1. */
1990 else if (const_vf > 1)
1991 set_range_info (niters_vector, VR_RANGE,
1992 wi::one (TYPE_PRECISION (type)),
1993 wi::rshift (wi::max_value (TYPE_PRECISION (type),
1994 TYPE_SIGN (type))
1995 - (const_vf - 1),
1996 exact_log2 (const_vf), TYPE_SIGN (type))
1997 + 1);
1998 }
1999 }
2000 *niters_vector_ptr = niters_vector;
2001 *step_vector_ptr = step_vector;
2002
2003 return;
2004 }
2005
2006 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2007 loop specified by LOOP_VINFO after vectorization, compute the number
2008 of iterations before vectorization (niters_vector * vf) and store it
2009 to NITERS_VECTOR_MULT_VF_PTR. */
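
/* E.g. (figures assumed): with NITERS_VECTOR == 5 and a constant VF of 8,
   this emits niters_vector_mult_vf = 5 << 3 == 40, the number of scalar
   iterations covered by the vector loop.  */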
2010
2011 static void
2012 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2013 tree niters_vector,
2014 tree *niters_vector_mult_vf_ptr)
2015 {
2016 /* We should be using a step_vector of VF if VF is variable. */
2017 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2018 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2019 tree type = TREE_TYPE (niters_vector);
2020 tree log_vf = build_int_cst (type, exact_log2 (vf));
2021 basic_block exit_bb = single_exit (loop)->dest;
2022
2023 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2024 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2025 niters_vector, log_vf);
2026 if (!is_gimple_val (niters_vector_mult_vf))
2027 {
2028 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2029 gimple_seq stmts = NULL;
2030 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2031 &stmts, true, var);
2032 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2033 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2034 }
2035 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2036 }
2037
2038 /* LCSSA_PHI is a lcssa phi of the EPILOG loop, which is copied from LOOP;
2039 this function searches for the corresponding lcssa phi node in exit
2040 bb of LOOP. If it is found, return the phi result; otherwise return
2041 NULL. */
2042
2043 static tree
2044 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2045 gphi *lcssa_phi)
2046 {
2047 gphi_iterator gsi;
2048 edge e = single_exit (loop);
2049
2050 gcc_assert (single_pred_p (e->dest));
2051 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2052 {
2053 gphi *phi = gsi.phi ();
2054 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2055 PHI_ARG_DEF (lcssa_phi, 0), 0))
2056 return PHI_RESULT (phi);
2057 }
2058 return NULL_TREE;
2059 }
2060
2061 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2062    from SECOND/FIRST and puts it at the original loop's preheader/exit
2063    edge; the two loops are arranged as below:
2064
2065 preheader_a:
2066 first_loop:
2067 header_a:
2068 i_1 = PHI<i_0, i_2>;
2069 ...
2070 i_2 = i_1 + 1;
2071 if (cond_a)
2072 goto latch_a;
2073 else
2074 goto between_bb;
2075 latch_a:
2076 goto header_a;
2077
2078 between_bb:
2079 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2080
2081 second_loop:
2082 header_b:
2083 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2084 or with i_2 if no LCSSA phi is created
2085 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2086 ...
2087 i_4 = i_3 + 1;
2088 if (cond_b)
2089 goto latch_b;
2090 else
2091 goto exit_bb;
2092 latch_b:
2093 goto header_b;
2094
2095 exit_bb:
2096
2097    This function creates loop closed SSA for the first loop and updates the
2098    second loop's PHI nodes by replacing the argument on the incoming edge
2099    with the result of the newly created lcssa PHI nodes.  If
2100    CREATE_LCSSA_FOR_IV_PHIS is false, loop closed ssa phis will only be
2101    created for non-iv phis of the first loop.
2102 
2103    This function assumes the exit bb of the first loop is the preheader bb
2104    of the second loop, i.e. between_bb in the example code.  With the PHIs
2105    updated, the second loop will execute the remaining iterations of the first.  */
2106
2107 static void
2108 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2109 class loop *first, class loop *second,
2110 bool create_lcssa_for_iv_phis)
2111 {
2112 gphi_iterator gsi_update, gsi_orig;
2113 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2114
2115 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2116 edge second_preheader_e = loop_preheader_edge (second);
2117 basic_block between_bb = single_exit (first)->dest;
2118
2119 gcc_assert (between_bb == second_preheader_e->src);
2120 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2121 /* Either the first loop or the second is the loop to be vectorized. */
2122 gcc_assert (loop == first || loop == second);
2123
2124 for (gsi_orig = gsi_start_phis (first->header),
2125 gsi_update = gsi_start_phis (second->header);
2126 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2127 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2128 {
2129 gphi *orig_phi = gsi_orig.phi ();
2130 gphi *update_phi = gsi_update.phi ();
2131
2132 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2133 /* Generate lcssa PHI node for the first loop. */
2134 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2135 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2136 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2137 {
2138 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2139 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2140 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2141 arg = new_res;
2142 }
2143
2144 /* Update PHI node in the second loop by replacing arg on the loop's
2145 incoming edge. */
2146 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2147 }
2148
2149 /* For epilogue peeling we have to make sure to copy all LC PHIs
2150 for correct vectorization of live stmts. */
2151 if (loop == first)
2152 {
2153 basic_block orig_exit = single_exit (second)->dest;
2154 for (gsi_orig = gsi_start_phis (orig_exit);
2155 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2156 {
2157 gphi *orig_phi = gsi_orig.phi ();
2158 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2159 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2160 continue;
2161
2162 /* Already created in the above loop. */
2163 if (find_guard_arg (first, second, orig_phi))
2164 continue;
2165
2166 tree new_res = copy_ssa_name (orig_arg);
2167 gphi *lcphi = create_phi_node (new_res, between_bb);
2168 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2169 }
2170 }
2171 }
2172
2173 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2174 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2175 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2176 appear like below:
2177
2178 guard_bb:
2179 if (cond)
2180 goto merge_bb;
2181 else
2182 goto skip_loop;
2183
2184 skip_loop:
2185 header_a:
2186 i_1 = PHI<i_0, i_2>;
2187 ...
2188 i_2 = i_1 + 1;
2189 if (cond_a)
2190 goto latch_a;
2191 else
2192 goto exit_a;
2193 latch_a:
2194 goto header_a;
2195
2196 exit_a:
2197 i_5 = PHI<i_2>;
2198
2199 merge_bb:
2200 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2201
2202 update_loop:
2203 header_b:
2204 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2205 ...
2206 i_4 = i_3 + 1;
2207 if (cond_b)
2208 goto latch_b;
2209 else
2210 goto exit_bb;
2211 latch_b:
2212 goto header_b;
2213
2214 exit_bb:
2215
2216 This function creates PHI nodes at merge_bb and replaces the use of i_5
2217    in the update_loop's PHI node with the new PHI result.  */
2218
2219 static void
2220 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2221 class loop *update_loop,
2222 edge guard_edge, edge merge_edge)
2223 {
2224 location_t merge_loc, guard_loc;
2225 edge orig_e = loop_preheader_edge (skip_loop);
2226 edge update_e = loop_preheader_edge (update_loop);
2227 gphi_iterator gsi_orig, gsi_update;
2228
2229 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2230 gsi_update = gsi_start_phis (update_loop->header));
2231 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2232 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2233 {
2234 gphi *orig_phi = gsi_orig.phi ();
2235 gphi *update_phi = gsi_update.phi ();
2236
2237 /* Generate new phi node at merge bb of the guard. */
2238 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2239 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2240
2241 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2242 args in NEW_PHI for these edges. */
2243 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2244 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2245 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2246 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2247 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2248 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2249
2250 /* Update phi in UPDATE_PHI. */
2251 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2252 }
2253 }
2254
2255 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2256 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2257 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2258 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2259 The CFG looks like:
2260
2261 loop:
2262 header_a:
2263 i_1 = PHI<i_0, i_2>;
2264 ...
2265 i_2 = i_1 + 1;
2266 if (cond_a)
2267 goto latch_a;
2268 else
2269 goto exit_a;
2270 latch_a:
2271 goto header_a;
2272
2273 exit_a:
2274
2275 guard_bb:
2276 if (cond)
2277 goto merge_bb;
2278 else
2279 goto epilog_loop;
2280
2281 ;; fall_through_bb
2282
2283 epilog_loop:
2284 header_b:
2285 i_3 = PHI<i_2, i_4>;
2286 ...
2287 i_4 = i_3 + 1;
2288 if (cond_b)
2289 goto latch_b;
2290 else
2291 goto merge_bb;
2292 latch_b:
2293 goto header_b;
2294
2295 merge_bb:
2296 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2297
2298 exit_bb:
2299 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2300
2301    For each name used outside EPILOG (i.e. for each name that has a lcssa
2302 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2303 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2304 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2305 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2306 in exit_bb will also be updated. */
2307
2308 static void
2309 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2310 edge guard_edge, edge merge_edge)
2311 {
2312 gphi_iterator gsi;
2313 basic_block merge_bb = guard_edge->dest;
2314
2315 gcc_assert (single_succ_p (merge_bb));
2316 edge e = single_succ_edge (merge_bb);
2317 basic_block exit_bb = e->dest;
2318 gcc_assert (single_pred_p (exit_bb));
2319 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2320
2321 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2322 {
2323 gphi *update_phi = gsi.phi ();
2324 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2325
2326 tree merge_arg = NULL_TREE;
2327
2328 /* If the old argument is a SSA_NAME use its current_def. */
2329 if (TREE_CODE (old_arg) == SSA_NAME)
2330 merge_arg = get_current_def (old_arg);
2331 /* If it's a constant or doesn't have a current_def, just use the old
2332 argument. */
2333 if (!merge_arg)
2334 merge_arg = old_arg;
2335
2336 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2337       /* If the var is live after the loop but is not a reduction, we simply
2338 	 use the old arg.  */
2339 if (!guard_arg)
2340 guard_arg = old_arg;
2341
2342 /* Create new phi node in MERGE_BB: */
2343 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2344 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2345
2346 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2347 the two PHI args in merge_phi for these edges. */
2348 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2349 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2350
2351 /* Update the original phi in exit_bb. */
2352 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2353 }
2354 }
2355
2356 /* EPILOG loop is duplicated from the original loop for vectorizing,
2357 the arg of its loop closed ssa PHI needs to be updated. */
2358
2359 static void
2360 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2361 {
2362 gphi_iterator gsi;
2363 basic_block exit_bb = single_exit (epilog)->dest;
2364
2365 gcc_assert (single_pred_p (exit_bb));
2366 edge e = EDGE_PRED (exit_bb, 0);
2367 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2368 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2369 }
2370
2371 /* Function vect_do_peeling.
2372
2373 Input:
2374 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2375
2376 preheader:
2377 LOOP:
2378 header_bb:
2379 loop_body
2380 if (exit_loop_cond) goto exit_bb
2381 else goto header_bb
2382 exit_bb:
2383
2384 - NITERS: The number of iterations of the loop.
2385 - NITERSM1: The number of iterations of the loop's latch.
2386 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2387 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2388 CHECK_PROFITABILITY is true.
2389 Output:
2390 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2391 iterate after vectorization; see vect_set_loop_condition for details.
2392 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2393 should be set to the number of scalar iterations handled by the
2394 vector loop. The SSA name is only used on exit from the loop.
2395
2396 This function peels prolog and epilog from the loop, adds guards skipping
2397 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2398 would look like:
2399
2400 guard_bb_1:
2401 if (prefer_scalar_loop) goto merge_bb_1
2402 else goto guard_bb_2
2403
2404 guard_bb_2:
2405 if (skip_prolog) goto merge_bb_2
2406 else goto prolog_preheader
2407
2408 prolog_preheader:
2409 PROLOG:
2410 prolog_header_bb:
2411 prolog_body
2412 if (exit_prolog_cond) goto prolog_exit_bb
2413 else goto prolog_header_bb
2414 prolog_exit_bb:
2415
2416 merge_bb_2:
2417
2418 vector_preheader:
2419 VECTOR LOOP:
2420 vector_header_bb:
2421 vector_body
2422 if (exit_vector_cond) goto vector_exit_bb
2423 else goto vector_header_bb
2424 vector_exit_bb:
2425
2426 guard_bb_3:
2427 if (skip_epilog) goto merge_bb_3
2428 else goto epilog_preheader
2429
2430 merge_bb_1:
2431
2432 epilog_preheader:
2433 EPILOG:
2434 epilog_header_bb:
2435 epilog_body
2436 if (exit_epilog_cond) goto merge_bb_3
2437 else goto epilog_header_bb
2438
2439 merge_bb_3:
2440
2441    Note this function peels the prolog and epilog, and adds guards,
2442    only when necessary.
2443 This function returns the epilogue loop if a decision was made to vectorize
2444 it, otherwise NULL.
2445
2446 The analysis resulting in this epilogue loop's loop_vec_info was performed
2447 in the same vect_analyze_loop call as the main loop's. At that time
2448 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2449 vectorization factors than the main loop. This list is stored in the main
2450    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2451 vectorize the epilogue loop for a lower vectorization factor, the
2452 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2453 updated and linked to the epilogue loop. This is later used to vectorize
2454 the epilogue. The reason the loop_vec_info needs updating is that it was
2455 constructed based on the original main loop, and the epilogue loop is a
2456 copy of this loop, so all links pointing to statements in the original loop
2457 need updating. Furthermore, these loop_vec_infos share the
2458 data_reference's records, which will also need to be updated.
2459
2460 TODO: Guard for prefer_scalar_loop should be emitted along with
2461 versioning conditions if loop versioning is needed. */
2462
2463
2464 class loop *
2465 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2466 tree *niters_vector, tree *step_vector,
2467 tree *niters_vector_mult_vf_var, int th,
2468 bool check_profitability, bool niters_no_overflow,
2469 tree *advance)
2470 {
2471 edge e, guard_e;
2472 tree type = TREE_TYPE (niters), guard_cond;
2473 basic_block guard_bb, guard_to;
2474 profile_probability prob_prolog, prob_vector, prob_epilog;
2475 int estimated_vf;
2476 int prolog_peeling = 0;
2477 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2478 /* We currently do not support prolog peeling if the target alignment is not
2479 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2480 target alignment being constant. */
2481 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2482 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2483 return NULL;
2484
2485 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2486 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487
2488 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2489 poly_uint64 bound_epilog = 0;
2490 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2491 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2492 bound_epilog += vf - 1;
2493 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2494 bound_epilog += 1;
2495 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2496 poly_uint64 bound_scalar = bound_epilog;
2497
2498 if (!prolog_peeling && !epilog_peeling)
2499 return NULL;
2500
2501 /* Before doing any peeling make sure to reset debug binds outside of
2502      the loop referring to defs not in LC SSA.  */
2503 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2504 for (unsigned i = 0; i < loop->num_nodes; ++i)
2505 {
2506 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2507 imm_use_iterator ui;
2508 gimple *use_stmt;
2509 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2510 gsi_next (&gsi))
2511 {
2512 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2513 if (gimple_debug_bind_p (use_stmt)
2514 && loop != gimple_bb (use_stmt)->loop_father
2515 && !flow_loop_nested_p (loop,
2516 gimple_bb (use_stmt)->loop_father))
2517 {
2518 gimple_debug_bind_reset_value (use_stmt);
2519 update_stmt (use_stmt);
2520 }
2521 }
2522 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523 gsi_next (&gsi))
2524 {
2525 ssa_op_iter op_iter;
2526 def_operand_p def_p;
2527 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2528 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2529 if (gimple_debug_bind_p (use_stmt)
2530 && loop != gimple_bb (use_stmt)->loop_father
2531 && !flow_loop_nested_p (loop,
2532 gimple_bb (use_stmt)->loop_father))
2533 {
2534 gimple_debug_bind_reset_value (use_stmt);
2535 update_stmt (use_stmt);
2536 }
2537 }
2538 }
2539
2540 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2541 estimated_vf = vect_vf_for_cost (loop_vinfo);
2542 if (estimated_vf == 2)
2543 estimated_vf = 3;
2544 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2545 .apply_scale (estimated_vf - 1, estimated_vf);
2546
2547 class loop *prolog, *epilog = NULL;
2548 class loop *first_loop = loop;
2549 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2550
2551 /* We might have a queued need to update virtual SSA form. As we
2552 delete the update SSA machinery below after doing a regular
2553 incremental SSA update during loop copying make sure we don't
2554 lose that fact.
2555 ??? Needing to update virtual SSA form by renaming is unfortunate
2556 but not all of the vectorizer code inserting new loads / stores
2557 properly assigns virtual operands to those statements. */
2558 update_ssa (TODO_update_ssa_only_virtuals);
2559
2560 create_lcssa_for_virtual_phi (loop);
2561
2562 /* If we're vectorizing an epilogue loop, the update_ssa above will
2563 have ensured that the virtual operand is in SSA form throughout the
2564 vectorized main loop. Normally it is possible to trace the updated
2565 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2566 back to scalar-stmt vuses, meaning that the effect of the SSA update
2567 remains local to the main loop. However, there are rare cases in
2568 which the vectorized loop has vdefs even when the original scalar
2569 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2570 introduces clobbers of the temporary vector array, which in turn
2571 needs new vdefs. If the scalar loop doesn't write to memory, these
2572 new vdefs will be the only ones in the vector loop.
2573
2574 In that case, update_ssa will have added a new virtual phi to the
2575 main loop, which previously didn't need one. Ensure that we (locally)
2576 maintain LCSSA form for the virtual operand, just as we would have
2577 done if the virtual phi had existed from the outset. This makes it
2578 easier to duplicate the scalar epilogue loop below. */
2579 tree vop_to_rename = NULL_TREE;
2580 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2581 {
2582 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2583 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2584 }
2585
2586 if (MAY_HAVE_DEBUG_BIND_STMTS)
2587 {
2588 gcc_assert (!adjust_vec.exists ());
2589 adjust_vec.create (32);
2590 }
2591 initialize_original_copy_tables ();
2592
2593 /* Record the anchor bb at which the guard should be placed if the scalar
2594 loop might be preferred. */
2595 basic_block anchor = loop_preheader_edge (loop)->src;
2596
2597 /* Generate the number of iterations for the prolog loop. We do this here
2598 so that we can also get the upper bound on the number of iterations. */
2599 tree niters_prolog;
2600 int bound_prolog = 0;
2601 if (prolog_peeling)
2602 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2603 &bound_prolog);
2604 else
2605 niters_prolog = build_int_cst (type, 0);
2606
2607 loop_vec_info epilogue_vinfo = NULL;
2608 if (vect_epilogues)
2609 {
2610 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2611 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2612 }
2613
2614 tree niters_vector_mult_vf = NULL_TREE;
2615   /* Save the loop's NITERS before peeling, as it may be changed by the prologue.  */
2616 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2617 edge update_e = NULL, skip_e = NULL;
2618 unsigned int lowest_vf = constant_lower_bound (vf);
2619 /* If we know the number of scalar iterations for the main loop we should
2620 check whether after the main loop there are enough iterations left over
2621 for the epilogue. */
2622 if (vect_epilogues
2623 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624 && prolog_peeling >= 0
2625 && known_eq (vf, lowest_vf))
2626 {
2627 unsigned HOST_WIDE_INT eiters
2628 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2629 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2630
2631 eiters -= prolog_peeling;
2632 eiters
2633 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2634
2635 unsigned int ratio;
2636 unsigned int epilogue_gaps
2637 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
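      /* Discard candidate epilogue loop_vec_infos whose vector mode size
	 does not evenly divide the main loop's, or for which the iterations
	 left over after the main loop would not cover the candidate's VF
	 plus any peeling for gaps; fall back to the next smaller
	 candidate.  */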
2638 while (!(constant_multiple_p
2639 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2640 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2641 && eiters >= lowest_vf / ratio + epilogue_gaps))
2642 {
2643 delete epilogue_vinfo;
2644 epilogue_vinfo = NULL;
2645 if (loop_vinfo->epilogue_vinfos.length () == 0)
2646 {
2647 vect_epilogues = false;
2648 break;
2649 }
2650 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2651 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2652 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2653 }
2654 }
2655 /* Prolog loop may be skipped. */
2656 bool skip_prolog = (prolog_peeling != 0);
2657 /* The vector loop may be skipped to the epilog when there are not enough
2658    iterations to enter it.  If so, we perform a runtime check on NITERS
2659    to decide whether the vectorized loop should be skipped.  If we know
2660    the number of scalar iterations, we add such a runtime check only if
2661    that number may be smaller than the number of iterations required to
2662    enter the vectorized loop; for this we use the upper bounds on the
2663    prolog and epilog peeling.  When we don't know the number of
2664    iterations, the check is needed unless loop versioning is required,
2665    since the versioning condition already asserts that there are enough
2666    scalar iterations to enter the main loop; when we are versioning, we
2667    therefore only add such a skip if we have chosen to vectorize the
2668    epilogue.  */
2669 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2670 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2671 bound_prolog + bound_epilog)
2672 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2673 || vect_epilogues));
2674 /* Epilog loop must be executed if the number of iterations for epilog
2675 loop is known at compile time, otherwise we need to add a check at
2676 the end of vector loop and skip to the end of epilog loop. */
2677 bool skip_epilog = (prolog_peeling < 0
2678 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2679 || !vf.is_constant ());
2680 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2681 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2682 skip_epilog = false;
2683
2684 if (skip_vector)
2685 {
2686 split_edge (loop_preheader_edge (loop));
2687
2688 /* Due to the order in which we peel prolog and epilog, we first
2689 propagate probability to the whole loop. The purpose is to
2690 avoid adjusting probabilities of both prolog and vector loops
2691 separately. Note in this case, the probability of epilog loop
2692 needs to be scaled back later. */
2693 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2694 if (prob_vector.initialized_p ())
2695 {
2696 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2697 scale_loop_profile (loop, prob_vector, 0);
2698 }
2699 }
2700
2701 dump_user_location_t loop_loc = find_loop_location (loop);
2702 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2703 if (vect_epilogues)
2704 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2705 use the original scalar loop as remaining epilogue if necessary. */
2706 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2707 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2708
2709 if (prolog_peeling)
2710 {
2711 e = loop_preheader_edge (loop);
2712 if (!slpeel_can_duplicate_loop_p (loop, e))
2713 {
2714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2715 "loop can't be duplicated to preheader edge.\n");
2716 gcc_unreachable ();
2717 }
2718 /* Peel prolog and put it on preheader edge of loop. */
2719 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2720 if (!prolog)
2721 {
2722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2723 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2724 gcc_unreachable ();
2725 }
2726 prolog->force_vectorize = false;
2727 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2728 first_loop = prolog;
2729 reset_original_copy_tables ();
2730
2731 /* Update the number of iterations for prolog loop. */
2732 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2733 vect_set_loop_condition (prolog, NULL, niters_prolog,
2734 step_prolog, NULL_TREE, false);
2735
2736 /* Skip the prolog loop. */
2737 if (skip_prolog)
2738 {
2739 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2740 niters_prolog, build_int_cst (type, 0));
2741 guard_bb = loop_preheader_edge (prolog)->src;
2742 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2743 guard_to = split_edge (loop_preheader_edge (loop));
2744 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2745 guard_to, guard_bb,
2746 prob_prolog.invert (),
2747 irred_flag);
2748 e = EDGE_PRED (guard_to, 0);
2749 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2750 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2751
2752 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2753 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2754 }
2755
2756 /* Update init address of DRs. */
2757 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2758 /* Update niters for vector loop. */
2759 LOOP_VINFO_NITERS (loop_vinfo)
2760 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2761 LOOP_VINFO_NITERSM1 (loop_vinfo)
2762 = fold_build2 (MINUS_EXPR, type,
2763 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2764 bool new_var_p = false;
2765 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2766 /* It's guaranteed that vector loop bound before vectorization is at
2767 least VF, so set range information for newly generated var. */
2768 if (new_var_p)
2769 set_range_info (niters, VR_RANGE,
2770 wi::to_wide (build_int_cst (type, vf)),
2771 wi::to_wide (TYPE_MAX_VALUE (type)));
2772
2773 /* Prolog iterates at most bound_prolog times, latch iterates at
2774 most bound_prolog - 1 times. */
2775 record_niter_bound (prolog, bound_prolog - 1, false, true);
2776 delete_update_ssa ();
2777 adjust_vec_debug_stmts ();
2778 scev_reset ();
2779 }
2780
2781 if (epilog_peeling)
2782 {
2783 e = single_exit (loop);
2784 if (!slpeel_can_duplicate_loop_p (loop, e))
2785 {
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2787 "loop can't be duplicated to exit edge.\n");
2788 gcc_unreachable ();
2789 }
2790       /* Peel the epilog and put it on the exit edge of the loop.  If we are
2791 	 vectorizing said epilog then we should use a copy of the main loop as
2792 	 a starting point.  This loop may have already had some preliminary
2793 	 transformations to allow for more optimal vectorization, for example
2794 	 if-conversion.  If we are not vectorizing the epilog then we should
2795 	 use the scalar loop, as the transformations mentioned above make
2796 	 little or no sense when not vectorizing.  */
2797 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2798 if (vop_to_rename)
2799 {
2800 /* Vectorizing the main loop can sometimes introduce a vdef to
2801 a loop that previously didn't have one; see the comment above
2802 the definition of VOP_TO_RENAME for details. The definition
2803 D that holds on E will then be different from the definition
2804 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2805 rename VOP_TO_RENAME to D when copying the loop.
2806
2807 The virtual operand is in LCSSA form for the main loop,
2808 and no stmt between the main loop and E needs a vdef,
2809 so we know that D is provided by a phi rather than by a
2810 vdef on a normal gimple stmt. */
2811 basic_block vdef_bb = e->src;
2812 gphi *vphi;
2813 while (!(vphi = get_virtual_phi (vdef_bb)))
2814 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2815 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2816 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2817 }
2818 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2819 if (!epilog)
2820 {
2821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2822 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2823 gcc_unreachable ();
2824 }
2825 epilog->force_vectorize = false;
2826 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2827
2828       /* The scalar version of the loop may be preferred.  In this case, add a
2829 	 guard and skip to the epilog.  Note this only happens when the number
2830 	 of iterations of the loop is unknown at compile time; otherwise the
2831 	 loop wouldn't be vectorized.  */
2832 if (skip_vector)
2833 {
2834 /* Additional epilogue iteration is peeled if gap exists. */
2835 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2836 bound_prolog, bound_epilog,
2837 th, &bound_scalar,
2838 check_profitability);
2839 /* Build guard against NITERSM1 since NITERS may overflow. */
2840 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2841 guard_bb = anchor;
2842 guard_to = split_edge (loop_preheader_edge (epilog));
2843 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2844 guard_to, guard_bb,
2845 prob_vector.invert (),
2846 irred_flag);
2847 skip_e = guard_e;
2848 e = EDGE_PRED (guard_to, 0);
2849 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2850 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2851
2852 /* Simply propagate profile info from guard_bb to guard_to which is
2853 a merge point of control flow. */
2854 guard_to->count = guard_bb->count;
2855
2856 /* Scale probability of epilog loop back.
2857 FIXME: We should avoid scaling down and back up. Profile may
2858 get lost if we scale down to 0. */
2859 basic_block *bbs = get_loop_body (epilog);
2860 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2861 bbs[i]->count = bbs[i]->count.apply_scale
2862 (bbs[i]->count,
2863 bbs[i]->count.apply_probability
2864 (prob_vector));
2865 free (bbs);
2866 }
2867
2868 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2869       /* If the loop is peeled a non-zero constant number of times, niters now
2870 	 refers to orig_niters - prolog_peeling, which won't overflow even if
2871 	 orig_niters does.  */
2872 niters_no_overflow |= (prolog_peeling > 0);
2873 vect_gen_vector_loop_niters (loop_vinfo, niters,
2874 niters_vector, step_vector,
2875 niters_no_overflow);
2876 if (!integer_onep (*step_vector))
2877 {
2878 	  /* On exit from the loop we will have an easy way of calculating
2879 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2880 until then. */
2881 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2882 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2883 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2884 }
2885 else
2886 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2887 &niters_vector_mult_vf);
2888 /* Update IVs of original loop as if they were advanced by
2889 niters_vector_mult_vf steps. */
2890 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2891 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2892 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2893 update_e);
2894
2895 if (skip_epilog)
2896 {
2897 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2898 niters, niters_vector_mult_vf);
2899 guard_bb = single_exit (loop)->dest;
2900 guard_to = split_edge (single_exit (epilog));
2901 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2902 skip_vector ? anchor : guard_bb,
2903 prob_epilog.invert (),
2904 irred_flag);
2905 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2906 single_exit (epilog));
2907 /* Only need to handle basic block before epilog loop if it's not
2908 the guard_bb, which is the case when skip_vector is true. */
2909 if (guard_bb != bb_before_epilog)
2910 {
2911 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2912
2913 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2914 }
2915 scale_loop_profile (epilog, prob_epilog, 0);
2916 }
2917 else
2918 slpeel_update_phi_nodes_for_lcssa (epilog);
2919
2920 unsigned HOST_WIDE_INT bound;
2921 if (bound_scalar.is_constant (&bound))
2922 {
2923 gcc_assert (bound != 0);
2924 /* -1 to convert loop iterations to latch iterations. */
2925 record_niter_bound (epilog, bound - 1, false, true);
2926 }
2927
2928 delete_update_ssa ();
2929 adjust_vec_debug_stmts ();
2930 scev_reset ();
2931 }
2932
2933 if (vect_epilogues)
2934 {
2935 epilog->aux = epilogue_vinfo;
2936 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2937
2938 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2939
2940       /* We must now calculate the number of iterations NITERS performed by the
2941 	 previous loop and the number EPILOGUE_NITERS to be performed by the epilogue.  */
2942 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2943 niters_prolog, niters_vector_mult_vf);
2944
2945       /* If skip_vector, the previous loop may be skipped, so insert a phi node
2946 	 to determine whether we are coming from the previous vectorized loop
2947 	 via the update_e edge or from the skip_vector basic block via the
2948 	 skip_e edge.  */
2949 if (skip_vector)
2950 {
2951 gcc_assert (update_e != NULL && skip_e != NULL);
2952 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2953 update_e->dest);
2954 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2955 gimple *stmt = gimple_build_assign (new_ssa, niters);
2956 gimple_stmt_iterator gsi;
2957 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2958 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2959 {
2960 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2961 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2962 }
2963 else
2964 {
2965 gsi = gsi_last_bb (update_e->src);
2966 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2967 }
2968
2969 niters = new_ssa;
2970 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2971 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2972 UNKNOWN_LOCATION);
2973 niters = PHI_RESULT (new_phi);
2974 }
2975
2976 /* Subtract the number of iterations performed by the vectorized loop
2977 from the number of total iterations. */
2978 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2979 before_loop_niters,
2980 niters);
2981
2982 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2983 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2984 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2985 epilogue_niters,
2986 build_one_cst (TREE_TYPE (epilogue_niters)));
2987
2988 /* Set ADVANCE to the number of iterations performed by the previous
2989 loop and its prologue. */
2990 *advance = niters;
2991
2992 /* Redo the peeling for niter analysis as the NITERs and alignment
2993 may have been updated to take the main loop into account. */
2994 determine_peel_for_niter (epilogue_vinfo);
2995 }
2996
2997 adjust_vec.release ();
2998 free_original_copy_tables ();
2999
3000 return vect_epilogues ? epilog : NULL;
3001 }
3002
3003 /* Function vect_create_cond_for_niters_checks.
3004
3005 Create a conditional expression that represents the run-time checks for
3006 loop's niter. The loop is guaranteed to terminate if the run-time
3007 checks hold.
3008
3009 Input:
3010    COND_EXPR - input conditional expression.  New conditions will be chained
3011 	       with logical AND operation.  If it is NULL, the niters
3012 	       assumption becomes the whole condition.
3013    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the runtime
3014 		assumptions under which the niter analysis is valid.
3015
3016 Output:
3017 COND_EXPR - conditional expression.
3018
3019 The returned COND_EXPR is the conditional expression to be used in the
3020 if statement that controls which version of the loop gets executed at
3021 runtime. */
3022
3023 static void
3024 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3025 {
3026 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3027
3028 if (*cond_expr)
3029 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3030 *cond_expr, part_cond_expr);
3031 else
3032 *cond_expr = part_cond_expr;
3033 }
3034
3035 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3036 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3037
3038 static void
3039 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3040 {
3041 if (*cond_expr)
3042 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3043 *cond_expr, part_cond_expr);
3044 else
3045 *cond_expr = part_cond_expr;
3046 }
3047
3048 /* Function vect_create_cond_for_align_checks.
3049
3050 Create a conditional expression that represents the alignment checks for
3051 all of data references (array element references) whose alignment must be
3052 checked at runtime.
3053
3054 Input:
3055 COND_EXPR - input conditional expression. New conditions will be chained
3056 with logical AND operation.
3057 LOOP_VINFO - two fields of the loop information are used.
3058 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3059 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3060
3061 Output:
3062 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3063 expression.
3064 The returned value is the conditional expression to be used in the if
3065 statement that controls which version of the loop gets executed at runtime.
3066
3067 The algorithm makes two assumptions:
3068 1) The number of bytes "n" in a vector is a power of 2.
3069      2) An address "a" is aligned if a%n is zero, so the
3070 test can be done as a&(n-1) == 0. For example, for 16
3071 byte vectors the test is a&0xf == 0. */
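
/* The function below relies on the same observation in combined form:
   ORing all the addresses together and testing the result once is
   equivalent to testing each address separately, because
   (addr_1 | ... | addr_k) & (n-1) is zero iff every addr_i & (n-1) is
   zero.  A minimal stand-alone sketch of the idea (hypothetical helper,
   not part of the vectorizer):

     static bool
     all_aligned_p (const uintptr_t *addrs, size_t count, uintptr_t mask)
     {
       uintptr_t ored = 0;
       for (size_t i = 0; i < count; i++)
	 ored |= addrs[i];
       return (ored & mask) == 0;
     }
   */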
3072
3073 static void
3074 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3075 tree *cond_expr,
3076 gimple_seq *cond_expr_stmt_list)
3077 {
3078 vec<stmt_vec_info> may_misalign_stmts
3079 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3080 stmt_vec_info stmt_info;
3081 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3082 tree mask_cst;
3083 unsigned int i;
3084 tree int_ptrsize_type;
3085 char tmp_name[20];
3086 tree or_tmp_name = NULL_TREE;
3087 tree and_tmp_name;
3088 gimple *and_stmt;
3089 tree ptrsize_zero;
3090 tree part_cond_expr;
3091
3092 /* Check that mask is one less than a power of 2, i.e., mask is
3093 all zeros followed by all ones. */
3094 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3095
3096 int_ptrsize_type = signed_type_for (ptr_type_node);
3097
3098 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3099 of the first vector of the i'th data reference. */
3100
3101 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3102 {
3103 gimple_seq new_stmt_list = NULL;
3104 tree addr_base;
3105 tree addr_tmp_name;
3106 tree new_or_tmp_name;
3107 gimple *addr_stmt, *or_stmt;
3108 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3109 bool negative = tree_int_cst_compare
3110 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3111 tree offset = negative
3112 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3113
3114 /* create: addr_tmp = (int)(address_of_first_vector) */
3115 addr_base =
3116 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3117 offset);
3118 if (new_stmt_list != NULL)
3119 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3120
3121 sprintf (tmp_name, "addr2int%d", i);
3122 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3123 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3124 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3125
3126 /* The addresses are ORed together.  */
3127
3128 if (or_tmp_name != NULL_TREE)
3129 {
3130 /* create: or_tmp = or_tmp | addr_tmp */
3131 sprintf (tmp_name, "orptrs%d", i);
3132 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3133 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3134 or_tmp_name, addr_tmp_name);
3135 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3136 or_tmp_name = new_or_tmp_name;
3137 }
3138 else
3139 or_tmp_name = addr_tmp_name;
3140
3141 } /* end for i */
3142
3143 mask_cst = build_int_cst (int_ptrsize_type, mask);
3144
3145 /* create: and_tmp = or_tmp & mask */
3146 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3147
3148 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3149 or_tmp_name, mask_cst);
3150 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3151
3152 /* Make and_tmp the left operand of the conditional test against zero.
3153 If and_tmp has a nonzero bit then some address is unaligned.  */
3154 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3155 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3156 and_tmp_name, ptrsize_zero);
3157 chain_cond_expr (cond_expr, part_cond_expr);
3158 }
3159
3160 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3161 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3162 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3163 and this new condition are true. Treat a null *COND_EXPR as "true". */
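/* For example, for a single recorded pair <A, B> the condition chained
   below is simply  &A != &B;  multiple pairs are combined with
   TRUTH_AND_EXPR via chain_cond_expr.  */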
3164
3165 static void
3166 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3167 {
3168 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3169 unsigned int i;
3170 vec_object_pair *pair;
3171 FOR_EACH_VEC_ELT (pairs, i, pair)
3172 {
3173 tree addr1 = build_fold_addr_expr (pair->first);
3174 tree addr2 = build_fold_addr_expr (pair->second);
3175 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3176 addr1, addr2);
3177 chain_cond_expr (cond_expr, part_cond_expr);
3178 }
3179 }
3180
3181 /* Create an expression that is true when all lower-bound conditions for
3182 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3183
3184 static void
3185 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3186 {
3187 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3188 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3189 {
3190 tree expr = lower_bounds[i].expr;
3191 tree type = unsigned_type_for (TREE_TYPE (expr));
3192 expr = fold_convert (type, expr);
3193 poly_uint64 bound = lower_bounds[i].min_value;
3194 if (!lower_bounds[i].unsigned_p)
3195 {
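	  /* EXPR has been converted to its unsigned type.  Biasing it by
	     BOUND - 1 (with wrap-around) and comparing against
	     2 * BOUND - 1 below tests |signed EXPR| >= BOUND: values in
	     (-BOUND, BOUND) bias into [0, 2 * BOUND - 2], everything else
	     ends up at or above 2 * BOUND - 1.  */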
3196 expr = fold_build2 (PLUS_EXPR, type, expr,
3197 build_int_cstu (type, bound - 1));
3198 bound += bound - 1;
3199 }
3200 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3201 build_int_cstu (type, bound));
3202 chain_cond_expr (cond_expr, part_cond_expr);
3203 }
3204 }
3205
3206 /* Function vect_create_cond_for_alias_checks.
3207
3208 Create a conditional expression that represents the run-time checks for
3209 overlapping of address ranges represented by a list of data references
3210 relations passed as input.
3211
3212 Input:
3213 COND_EXPR - input conditional expression. New conditions will be chained
3214 with logical AND operation. If it is NULL, then the function
3215 is used to return the number of alias checks.
3216 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3217 to be checked.
3218
3219 Output:
3220 COND_EXPR - conditional expression.
3221
3222 The returned COND_EXPR is the conditional expression to be used in the if
3223 statement that controls which version of the loop gets executed at runtime.
3224 */
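/* Conceptually, each compared pair of segments <A, LEN_A> and <B, LEN_B>
   yields a non-overlap test of the shape

     (addr_A + LEN_A <= addr_B) || (addr_B + LEN_B <= addr_A)

   although create_runtime_alias_checks may merge pairs and emit more
   compact encodings of this test; this is an illustrative sketch rather
   than the exact expression produced.  */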
3225
3226 void
3227 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3228 {
3229 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3230 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3231
3232 if (comp_alias_ddrs.is_empty ())
3233 return;
3234
3235 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3236 &comp_alias_ddrs, cond_expr);
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "created %u versioning for alias checks.\n",
3240 comp_alias_ddrs.length ());
3241 }
3242
3243
3244 /* Function vect_loop_versioning.
3245
3246 If the loop has data references that may or may not be aligned and/or
3247 has data reference relations whose independence was not proven then
3248 two versions of the loop need to be generated, one which is vectorized
3249 and one which isn't. A test is then generated to control which of the
3250 loops is executed. The test checks for the alignment of all of the
3251 data references that may or may not be aligned. An additional
3252 sequence of runtime tests is generated for each pair of DDRs whose
3253 independence was not proven.  The vectorized version of the loop is
3254 executed only if both alias and alignment tests are passed.
3255
3256 The test generated to check which version of loop is executed
3257 is modified to also check for profitability as indicated by the
3258 cost model threshold TH.
3259
3260 The versioning precondition(s) are placed in *COND_EXPR and
3261 *COND_EXPR_STMT_LIST. */
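/* Schematically (illustration only) the versioned code looks like

     if (<profitability/niter/alignment/alias/simd-if checks>)
       loop to be vectorized
     else
       scalar copy of the loop

   where the condition AND-combines whichever of the checks below apply.  */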
3262
3263 class loop *
3264 vect_loop_versioning (loop_vec_info loop_vinfo,
3265 gimple *loop_vectorized_call)
3266 {
3267 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3268 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3269 basic_block condition_bb;
3270 gphi_iterator gsi;
3271 gimple_stmt_iterator cond_exp_gsi;
3272 basic_block merge_bb;
3273 basic_block new_exit_bb;
3274 edge new_exit_e, e;
3275 gphi *orig_phi, *new_phi;
3276 tree cond_expr = NULL_TREE;
3277 gimple_seq cond_expr_stmt_list = NULL;
3278 tree arg;
3279 profile_probability prob = profile_probability::likely ();
3280 gimple_seq gimplify_stmt_list = NULL;
3281 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3282 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3283 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3284 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3285 poly_uint64 versioning_threshold
3286 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3287 tree version_simd_if_cond
3288 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3289 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3290
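  /* SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1, i.e. the iteration count
     minus one, so the GE_EXPR comparisons against TH - 1 and
     VERSIONING_THRESHOLD - 1 below are equivalent to checking
     niters >= TH resp. niters >= VERSIONING_THRESHOLD.  */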
3291 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3292 && !ordered_p (th, versioning_threshold))
3293 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3294 build_int_cst (TREE_TYPE (scalar_loop_iters),
3295 th - 1));
3296 if (maybe_ne (versioning_threshold, 0U))
3297 {
3298 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3299 build_int_cst (TREE_TYPE (scalar_loop_iters),
3300 versioning_threshold - 1));
3301 if (cond_expr)
3302 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3303 expr, cond_expr);
3304 else
3305 cond_expr = expr;
3306 }
3307
3308 if (version_niter)
3309 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3310
3311 if (cond_expr)
3312 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3313 &cond_expr_stmt_list,
3314 is_gimple_condexpr, NULL_TREE);
3315
3316 if (version_align)
3317 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3318 &cond_expr_stmt_list);
3319
3320 if (version_alias)
3321 {
3322 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3323 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3324 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3325 }
3326
3327 if (version_simd_if_cond)
3328 {
3329 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3330 if (flag_checking)
3331 if (basic_block bb
3332 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3333 gcc_assert (bb != loop->header
3334 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3335 && (scalar_loop == NULL
3336 || (bb != scalar_loop->header
3337 && dominated_by_p (CDI_DOMINATORS,
3338 scalar_loop->header, bb))));
3339 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3340 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3341 version_simd_if_cond, zero);
3342 if (cond_expr)
3343 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3344 c, cond_expr);
3345 else
3346 cond_expr = c;
3347 if (dump_enabled_p ())
3348 dump_printf_loc (MSG_NOTE, vect_location,
3349 "created versioning for simd if condition check.\n");
3350 }
3351
3352 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3353 &gimplify_stmt_list,
3354 is_gimple_condexpr, NULL_TREE);
3355 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3356
3357 /* Compute the outermost loop in which cond_expr and cond_expr_stmt_list
3358 are invariant.  */
3359 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3360 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3361 !gsi_end_p (gsi); gsi_next (&gsi))
3362 {
3363 gimple *stmt = gsi_stmt (gsi);
3364 update_stmt (stmt);
3365 ssa_op_iter iter;
3366 use_operand_p use_p;
3367 basic_block def_bb;
3368 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3369 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3370 && flow_bb_inside_loop_p (outermost, def_bb))
3371 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3372 }
3373
3374 /* Search for the outermost loop we can version. Avoid versioning of
3375 non-perfect nests but allow if-conversion versioned loops inside. */
3376 class loop *loop_to_version = loop;
3377 if (flow_loop_nested_p (outermost, loop))
3378 {
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "trying to apply versioning to outer loop %d\n",
3382 outermost->num);
3383 if (outermost->num == 0)
3384 outermost = superloop_at_depth (loop, 1);
3385 /* And avoid applying versioning on non-perfect nests. */
3386 while (loop_to_version != outermost
3387 && (e = single_exit (loop_outer (loop_to_version)))
3388 && !(e->flags & EDGE_COMPLEX)
3389 && (!loop_outer (loop_to_version)->inner->next
3390 || vect_loop_vectorized_call (loop_to_version))
3391 && (!loop_outer (loop_to_version)->inner->next
3392 || !loop_outer (loop_to_version)->inner->next->next))
3393 loop_to_version = loop_outer (loop_to_version);
3394 }
3395
3396 /* Apply versioning. If there is already a scalar version created by
3397 if-conversion, re-use that.  Note we cannot re-use the copy of
3398 an if-converted outer-loop when vectorizing the inner loop only. */
3399 gcond *cond;
3400 if ((!loop_to_version->inner || loop == loop_to_version)
3401 && loop_vectorized_call)
3402 {
3403 gcc_assert (scalar_loop);
3404 condition_bb = gimple_bb (loop_vectorized_call);
3405 cond = as_a <gcond *> (last_stmt (condition_bb));
3406 gimple_cond_set_condition_from_tree (cond, cond_expr);
3407 update_stmt (cond);
3408
3409 if (cond_expr_stmt_list)
3410 {
3411 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3412 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3413 GSI_SAME_STMT);
3414 }
3415
3416 /* If-conversion uses profile_probability::always () for both paths;
3417 reset the probabilities of both paths appropriately. */
3418 edge te, fe;
3419 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3420 te->probability = prob;
3421 fe->probability = prob.invert ();
3422 /* We can scale loop counts immediately but have to postpone
3423 scaling the scalar loop because we re-use it during peeling. */
3424 scale_loop_frequencies (loop_to_version, te->probability);
3425 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3426
3427 nloop = scalar_loop;
3428 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_NOTE, vect_location,
3430 "reusing %sloop version created by if conversion\n",
3431 loop_to_version != loop ? "outer " : "");
3432 }
3433 else
3434 {
3435 if (loop_to_version != loop
3436 && dump_enabled_p ())
3437 dump_printf_loc (MSG_NOTE, vect_location,
3438 "applying loop versioning to outer loop %d\n",
3439 loop_to_version->num);
3440
3441 initialize_original_copy_tables ();
3442 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3443 prob, prob.invert (), prob, prob.invert (), true);
3444 gcc_assert (nloop);
3445 nloop = get_loop_copy (loop);
3446
3447 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy, nobody will
3448 reap those otherwise; they also refer to the original
3449 loops. */
3450 class loop *l = loop;
3451 while (gimple *call = vect_loop_vectorized_call (l))
3452 {
3453 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3454 fold_loop_internal_call (call, boolean_false_node);
3455 l = loop_outer (l);
3456 }
3457 free_original_copy_tables ();
3458
3459 if (cond_expr_stmt_list)
3460 {
3461 cond_exp_gsi = gsi_last_bb (condition_bb);
3462 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3463 GSI_SAME_STMT);
3464 }
3465
3466 /* Loop versioning violates an assumption we try to maintain during
3467 vectorization - that the loop exit block has a single predecessor.
3468 After versioning, the exit block of both loop versions is the same
3469 basic block (i.e. it has two predecessors). Just in order to simplify
3470 following transformations in the vectorizer, we fix this situation
3471 here by adding a new (empty) block on the exit-edge of the loop,
3472 with the proper loop-exit phis to maintain loop-closed-form.
3473 If loop versioning wasn't done from loop, but from scalar_loop instead,
3474 merge_bb will already have just a single predecessor. */
3475
3476 merge_bb = single_exit (loop_to_version)->dest;
3477 if (EDGE_COUNT (merge_bb->preds) >= 2)
3478 {
3479 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3480 new_exit_bb = split_edge (single_exit (loop_to_version));
3481 new_exit_e = single_exit (loop_to_version);
3482 e = EDGE_SUCC (new_exit_bb, 0);
3483
3484 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3485 gsi_next (&gsi))
3486 {
3487 tree new_res;
3488 orig_phi = gsi.phi ();
3489 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3490 new_phi = create_phi_node (new_res, new_exit_bb);
3491 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3492 add_phi_arg (new_phi, arg, new_exit_e,
3493 gimple_phi_arg_location_from_edge (orig_phi, e));
3494 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3495 }
3496 }
3497
3498 update_ssa (TODO_update_ssa);
3499 }
3500
3501 if (version_niter)
3502 {
3503 /* The versioned loop could be infinite, so we need to clear the existing
3504 niter information, which was copied from the original loop. */
3505 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3506 vect_free_loop_info_assumptions (nloop);
3507 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3508 loop_constraint_set (loop, LOOP_C_INFINITE);
3509 }
3510
3511 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3512 && dump_enabled_p ())
3513 {
3514 if (version_alias)
3515 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3516 vect_location,
3517 "loop versioned for vectorization because of "
3518 "possible aliasing\n");
3519 if (version_align)
3520 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3521 vect_location,
3522 "loop versioned for vectorization to enhance "
3523 "alignment\n");
3524
3525 }
3526
3527 return nloop;
3528 }
3529