1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
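/* Illustrative sketch (added for clarity, not part of the original
   comment): the function below simply materialises

     CTRL = PHI <INIT_CTRL (preheader), NEXT_CTRL (latch)>

   in LOOP's header, so the control starts out as INIT_CTRL and takes the
   latch value NEXT_CTRL on every later iteration.  */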
262
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
266 {
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
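/* Worked example (illustration only, not from the original comment): for
   8-element masks and OFFSET == 0 the selector built below starts
   { 0, 8, 1, 9, 2, 10, ... }, i.e. it interleaves the low halves of the
   two inputs; OFFSET == 1 gives { 4, 12, 5, 13, ... }, the high-half
   interleave.  */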
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
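/* Worked example (illustration only): if DEST_RGM has controls
   { d0, d1, d2, d3 } and SRC_RGM has { s0, s1 }, then d0 and d1 are
   derived from s0 and d2 and d3 from s1, either by unpacking the source
   mask high/low or by interleaving it with itself.  */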
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
325 {
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
342 {
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
375 {
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGC->max_nscalars_per_iter * RGC->factor
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
416
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
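/* Rough sketch of the generated code (an illustration added here, not part
   of the original comment), for a single fully-masked rgroup with no
   skipped iterations and MIGHT_WRAP_P false:

     preheader:
       init_mask = .WHILE_ULT (0, limit, ...);
     header:
       iv = PHI <0 (preheader), iv_next (latch)>
       mask = PHI <init_mask (preheader), next_mask (latch)>
       ...
       iv_next = iv + NITEMS_STEP;
       next_mask = .WHILE_ULT (iv_next, limit, ...);
       if (next_mask != { 0, ... }) continue; else exit;  */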
420
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_seq *header_seq,
425 gimple_stmt_iterator loop_cond_gsi,
426 rgroup_controls *rgc, tree niters,
427 tree niters_skip, bool might_wrap_p)
428 {
429 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
430 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
431 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
432
433 tree ctrl_type = rgc->type;
434 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
435 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
436 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
437 tree length_limit = NULL_TREE;
438 /* For length-based controls, we need LENGTH_LIMIT to keep each length in range. */
439 if (!use_masks_p)
440 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
441
442 /* Calculate the maximum number of item values that the rgroup
443 handles in total, the number that it handles for each iteration
444 of the vector loop, and the number that it should skip during the
445 first iteration of the vector loop. */
446 tree nitems_total = niters;
447 tree nitems_step = build_int_cst (iv_type, vf);
448 tree nitems_skip = niters_skip;
449 if (nitems_per_iter != 1)
450 {
451 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
452 these multiplications don't overflow. */
453 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
454 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
455 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
456 nitems_total, compare_factor);
457 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
458 nitems_step, iv_factor);
459 if (nitems_skip)
460 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
461 nitems_skip, compare_factor);
462 }
463
464 /* Create an induction variable that counts the number of items
465 processed. */
466 tree index_before_incr, index_after_incr;
467 gimple_stmt_iterator incr_gsi;
468 bool insert_after;
469 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
470 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
471 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
472
473 tree zero_index = build_int_cst (compare_type, 0);
474 tree test_index, test_limit, first_limit;
475 gimple_stmt_iterator *test_gsi;
476 if (might_wrap_p)
477 {
478 /* In principle the loop should stop iterating once the incremented
479 IV reaches a value greater than or equal to:
480
481 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
482
483 However, there's no guarantee that this addition doesn't overflow
484 the comparison type, or that the IV hits a value above it before
485 wrapping around. We therefore adjust the limit down by one
486 IV step:
487
488 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
489 -[infinite-prec] NITEMS_STEP
490
491 and compare the IV against this limit _before_ incrementing it.
492 Since the comparison type is unsigned, we actually want the
493 subtraction to saturate at zero:
494
495 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
496 -[sat] NITEMS_STEP
497
498 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
499
500 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
501
502 where the rightmost subtraction can be done directly in
503 COMPARE_TYPE. */
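      /* Numeric illustration (added, not part of the original comment):
	 with NITEMS_TOTAL == 10, NITEMS_STEP == 16 and no NITEMS_SKIP the
	 limit becomes MAX (10, 16) - 16 == 0, so testing the
	 not-yet-incremented IV (0 < 0 is false) stops the loop after the
	 single iteration it needs, without relying on the IV ever
	 exceeding 10.  */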
504 test_index = index_before_incr;
505 tree adjust = gimple_convert (preheader_seq, compare_type,
506 nitems_step);
507 if (nitems_skip)
508 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
509 adjust, nitems_skip);
510 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
511 nitems_total, adjust);
512 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
513 test_limit, adjust);
514 test_gsi = &incr_gsi;
515
516 /* Get a safe limit for the first iteration. */
517 if (nitems_skip)
518 {
519 /* The first vector iteration can handle at most NITEMS_STEP
520 items. NITEMS_STEP <= CONST_LIMIT, and adding
521 NITEMS_SKIP to that cannot overflow. */
522 tree const_limit = build_int_cst (compare_type,
523 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
524 * nitems_per_iter);
525 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
526 nitems_total, const_limit);
527 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
528 first_limit, nitems_skip);
529 }
530 else
531 /* For the first iteration it doesn't matter whether the IV hits
532 a value above NITEMS_TOTAL. That only matters for the latch
533 condition. */
534 first_limit = nitems_total;
535 }
536 else
537 {
538 /* Test the incremented IV, which will always hit a value above
539 the bound before wrapping. */
540 test_index = index_after_incr;
541 test_limit = nitems_total;
542 if (nitems_skip)
543 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
544 test_limit, nitems_skip);
545 test_gsi = &loop_cond_gsi;
546
547 first_limit = test_limit;
548 }
549
550 /* Convert the IV value to the comparison type (either a no-op or
551 a demotion). */
552 gimple_seq test_seq = NULL;
553 test_index = gimple_convert (&test_seq, compare_type, test_index);
554 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
555
556 /* Provide a definition of each control in the group. */
557 tree next_ctrl = NULL_TREE;
558 tree ctrl;
559 unsigned int i;
560 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
561 {
562 /* Previous controls will cover BIAS items. This control covers the
563 next batch. */
564 poly_uint64 bias = nitems_per_ctrl * i;
565 tree bias_tree = build_int_cst (compare_type, bias);
566
567 /* See whether the first iteration of the vector loop is known
568 to have a full control. */
569 poly_uint64 const_limit;
570 bool first_iteration_full
571 = (poly_int_tree_p (first_limit, &const_limit)
572 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
573
574 /* Rather than have a new IV that starts at BIAS and goes up to
575 TEST_LIMIT, prefer to use the same 0-based IV for each control
576 and adjust the bound down by BIAS. */
577 tree this_test_limit = test_limit;
578 if (i != 0)
579 {
580 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
581 compare_type, this_test_limit,
582 bias_tree);
583 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
584 compare_type, this_test_limit,
585 bias_tree);
586 }
587
588 /* Create the initial control. First include all items that
589 are within the loop limit. */
590 tree init_ctrl = NULL_TREE;
591 if (!first_iteration_full)
592 {
593 tree start, end;
594 if (first_limit == test_limit)
595 {
596 /* Use a natural test between zero (the initial IV value)
597 and the loop limit. The "else" block would be valid too,
598 but this choice can avoid the need to load BIAS_TREE into
599 a register. */
600 start = zero_index;
601 end = this_test_limit;
602 }
603 else
604 {
605 /* FIRST_LIMIT is the maximum number of items handled by the
606 first iteration of the vector loop. Test the portion
607 associated with this control. */
608 start = bias_tree;
609 end = first_limit;
610 }
611
612 if (use_masks_p)
613 init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
614 start, end, "max_mask");
615 else
616 {
617 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
618 gimple_seq seq = vect_gen_len (init_ctrl, start,
619 end, length_limit);
620 gimple_seq_add_seq (preheader_seq, seq);
621 }
622 }
623
624 /* Now AND out the bits that are within the number of skipped
625 items. */
626 poly_uint64 const_skip;
627 if (nitems_skip
628 && !(poly_int_tree_p (nitems_skip, &const_skip)
629 && known_le (const_skip, bias)))
630 {
631 gcc_assert (use_masks_p);
632 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
633 bias_tree, nitems_skip);
634 if (init_ctrl)
635 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
636 init_ctrl, unskipped_mask);
637 else
638 init_ctrl = unskipped_mask;
639 }
640
641 if (!init_ctrl)
642 {
643 /* First iteration is full. */
644 if (use_masks_p)
645 init_ctrl = build_minus_one_cst (ctrl_type);
646 else
647 init_ctrl = length_limit;
648 }
649
650 /* Get the control value for the next iteration of the loop. */
651 if (use_masks_p)
652 {
653 gimple_seq stmts = NULL;
654 next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
655 this_test_limit, "next_mask");
656 gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
657 }
658 else
659 {
660 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
661 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
662 length_limit);
663 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
664 }
665
666 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
667 }
668
669 int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
670 if (partial_load_bias != 0)
671 {
672 tree adjusted_len = rgc->bias_adjusted_ctrl;
673 gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
674 rgc->controls[0],
675 build_int_cst
676 (TREE_TYPE (rgc->controls[0]),
677 partial_load_bias));
678 gimple_seq_add_stmt (header_seq, minus);
679 }
680
681 return next_ctrl;
682 }
683
684 /* Set up the iteration condition and rgroup controls for LOOP, given
685 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
686 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
687 the number of iterations of the original scalar loop that should be
688 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
689 for vect_set_loop_condition.
690
691 Insert the branch-back condition before LOOP_COND_GSI and return the
692 final gcond. */
693
694 static gcond *
695 vect_set_loop_condition_partial_vectors (class loop *loop,
696 loop_vec_info loop_vinfo, tree niters,
697 tree final_iv, bool niters_maybe_zero,
698 gimple_stmt_iterator loop_cond_gsi)
699 {
700 gimple_seq preheader_seq = NULL;
701 gimple_seq header_seq = NULL;
702
703 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
704 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
705 unsigned int compare_precision = TYPE_PRECISION (compare_type);
706 tree orig_niters = niters;
707
708 /* Type of the initial value of NITERS. */
709 tree ni_actual_type = TREE_TYPE (niters);
710 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
711 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
712
713 /* Convert NITERS to the same size as the compare. */
714 if (compare_precision > ni_actual_precision
715 && niters_maybe_zero)
716 {
717 /* We know that there is always at least one iteration, so if the
718 count is zero then it must have wrapped. Cope with this by
719 subtracting 1 before the conversion and adding 1 to the result. */
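      /* Numeric illustration (not in the original comment): if NITERS is
	 an 8-bit value of 0 standing for 256 iterations, we compute
	 (0 - 1) == 255, widen it to COMPARE_TYPE, and add 1 to get the
	 intended 256.  */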
720 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
721 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
722 niters, build_minus_one_cst (ni_actual_type));
723 niters = gimple_convert (&preheader_seq, compare_type, niters);
724 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
725 niters, build_one_cst (compare_type));
726 }
727 else
728 niters = gimple_convert (&preheader_seq, compare_type, niters);
729
730 /* Iterate over all the rgroups and fill in their controls. We could use
731 the first control from any rgroup for the loop condition; here we
732 arbitrarily pick the last. */
733 tree test_ctrl = NULL_TREE;
734 rgroup_controls *rgc;
735 unsigned int i;
736 auto_vec<rgroup_controls> *controls = use_masks_p
737 ? &LOOP_VINFO_MASKS (loop_vinfo)
738 : &LOOP_VINFO_LENS (loop_vinfo);
739 FOR_EACH_VEC_ELT (*controls, i, rgc)
740 if (!rgc->controls.is_empty ())
741 {
742 /* First try using permutes. This adds a single vector
743 instruction to the loop for each mask, but needs no extra
744 loop invariants or IVs. */
745 unsigned int nmasks = i + 1;
746 if (use_masks_p && (nmasks & 1) == 0)
747 {
748 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
749 if (!half_rgc->controls.is_empty ()
750 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
751 continue;
752 }
753
754 /* See whether zero-based IV would ever generate all-false masks
755 or zero length before wrapping around. */
756 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
757
758 /* Set up all controls for this group. */
759 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
760 &preheader_seq,
761 &header_seq,
762 loop_cond_gsi, rgc,
763 niters, niters_skip,
764 might_wrap_p);
765 }
766
767 /* Emit all accumulated statements. */
768 add_preheader_seq (loop, preheader_seq);
769 add_header_seq (loop, header_seq);
770
771 /* Get a boolean result that tells us whether to iterate. */
772 edge exit_edge = single_exit (loop);
773 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
774 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
775 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
776 NULL_TREE, NULL_TREE);
777 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
778
779 /* The loop iterates (NITERS - 1) / VF + 1 times.
780 Subtract one from this to get the latch count. */
781 tree step = build_int_cst (compare_type,
782 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
783 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
784 build_minus_one_cst (compare_type));
785 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
786 niters_minus_one, step);
787
788 if (final_iv)
789 {
790 gassign *assign = gimple_build_assign (final_iv, orig_niters);
791 gsi_insert_on_edge_immediate (single_exit (loop), assign);
792 }
793
794 return cond_stmt;
795 }
796
797 /* Like vect_set_loop_condition, but handle the case in which the vector
798 loop handles exactly VF scalars per iteration. */
799
800 static gcond *
801 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
802 tree final_iv, bool niters_maybe_zero,
803 gimple_stmt_iterator loop_cond_gsi)
804 {
805 tree indx_before_incr, indx_after_incr;
806 gcond *cond_stmt;
807 gcond *orig_cond;
808 edge pe = loop_preheader_edge (loop);
809 edge exit_edge = single_exit (loop);
810 gimple_stmt_iterator incr_gsi;
811 bool insert_after;
812 enum tree_code code;
813 tree niters_type = TREE_TYPE (niters);
814
815 orig_cond = get_loop_exit_condition (loop);
816 gcc_assert (orig_cond);
817 loop_cond_gsi = gsi_for_stmt (orig_cond);
818
819 tree init, limit;
820 if (!niters_maybe_zero && integer_onep (step))
821 {
822 /* In this case we can use a simple 0-based IV:
823
824 A:
825 x = 0;
826 do
827 {
828 ...
829 x += 1;
830 }
831 while (x < NITERS); */
832 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
833 init = build_zero_cst (niters_type);
834 limit = niters;
835 }
836 else
837 {
838 /* The following works for all values of NITERS except 0:
839
840 B:
841 x = 0;
842 do
843 {
844 ...
845 x += STEP;
846 }
847 while (x <= NITERS - STEP);
848
849 so that the loop continues to iterate if x + STEP - 1 < NITERS
850 but stops if x + STEP - 1 >= NITERS.
851
852 However, if NITERS is zero, x never hits a value above NITERS - STEP
853 before wrapping around. There are two obvious ways of dealing with
854 this:
855
856 - start at STEP - 1 and compare x before incrementing it
857 - start at -1 and compare x after incrementing it
858
859 The latter is simpler and is what we use. The loop in this case
860 looks like:
861
862 C:
863 x = -1;
864 do
865 {
866 ...
867 x += STEP;
868 }
869 while (x < NITERS - STEP);
870
871 In both cases the loop limit is NITERS - STEP. */
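      /* Numeric illustration (added, not part of the original comment):
	 NITERS == 8 and STEP == 4 give a limit of 4; case B executes
	 x == 0 and x == 4 and exits once x reaches 8, i.e. two iterations
	 and a latch count of 4 / 4 == 1, matching (NITERS - STEP) / STEP.  */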
872 gimple_seq seq = NULL;
873 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
874 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
875 if (seq)
876 {
877 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
878 gcc_assert (!new_bb);
879 }
880 if (niters_maybe_zero)
881 {
882 /* Case C. */
883 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
884 init = build_all_ones_cst (niters_type);
885 }
886 else
887 {
888 /* Case B. */
889 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
890 init = build_zero_cst (niters_type);
891 }
892 }
893
894 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
895 create_iv (init, step, NULL_TREE, loop,
896 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
897 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
898 true, NULL_TREE, true,
899 GSI_SAME_STMT);
900 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
901 true, GSI_SAME_STMT);
902
903 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
904 NULL_TREE);
905
906 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
907
908 /* Record the number of latch iterations. */
909 if (limit == niters)
910 /* Case A: the loop iterates NITERS times. Subtract one to get the
911 latch count. */
912 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
913 build_int_cst (niters_type, 1));
914 else
915 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
916 Subtract one from this to get the latch count. */
917 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
918 limit, step);
919
920 if (final_iv)
921 {
922 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
923 indx_after_incr, init);
924 gsi_insert_on_edge_immediate (single_exit (loop), assign);
925 }
926
927 return cond_stmt;
928 }
929
930 /* If we're using fully-masked loops, make LOOP iterate:
931
932 N == (NITERS - 1) / STEP + 1
933
934 times. When NITERS is zero, this is equivalent to making the loop
935 execute (1 << M) / STEP times, where M is the precision of NITERS.
936 NITERS_MAYBE_ZERO is true if this last case might occur.
937
938 If we're not using fully-masked loops, make LOOP iterate:
939
940 N == (NITERS - STEP) / STEP + 1
941
942 times, where NITERS is known to be outside the range [1, STEP - 1].
943 This is equivalent to making the loop execute NITERS / STEP times
944 when NITERS is nonzero and (1 << M) / STEP times otherwise.
945 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
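   (Numeric illustration, added for clarity and not part of the original
   comment: NITERS == 10 and STEP == 4 give N == (10 - 1) / 4 + 1 == 3 in
   the fully-masked case and N == (10 - 4) / 4 + 1 == 2, i.e. 10 / 4,
   otherwise.)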
946
947 If FINAL_IV is nonnull, it is an SSA name that should be set to
948 N * STEP on exit from the loop.
949
950 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
951
952 void
953 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
954 tree niters, tree step, tree final_iv,
955 bool niters_maybe_zero)
956 {
957 gcond *cond_stmt;
958 gcond *orig_cond = get_loop_exit_condition (loop);
959 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
960
961 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
962 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
963 niters, final_iv,
964 niters_maybe_zero,
965 loop_cond_gsi);
966 else
967 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
968 niters_maybe_zero,
969 loop_cond_gsi);
970
971 /* Remove old loop exit test. */
972 stmt_vec_info orig_cond_info;
973 if (loop_vinfo
974 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
975 loop_vinfo->remove_stmt (orig_cond_info);
976 else
977 gsi_remove (&loop_cond_gsi, true);
978
979 if (dump_enabled_p ())
980 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
981 cond_stmt);
982 }
983
984 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
985 For all PHI arguments in FROM->dest and TO->dest from those
986 edges, ensure that the TO->dest PHI arguments have their current_def
987 set to the corresponding value in FROM. */
988
989 static void
990 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
991 {
992 gimple_stmt_iterator gsi_from, gsi_to;
993
994 for (gsi_from = gsi_start_phis (from->dest),
995 gsi_to = gsi_start_phis (to->dest);
996 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
997 {
998 gimple *from_phi = gsi_stmt (gsi_from);
999 gimple *to_phi = gsi_stmt (gsi_to);
1000 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1001 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
1002 if (virtual_operand_p (from_arg))
1003 {
1004 gsi_next (&gsi_from);
1005 continue;
1006 }
1007 if (virtual_operand_p (to_arg))
1008 {
1009 gsi_next (&gsi_to);
1010 continue;
1011 }
1012 if (TREE_CODE (from_arg) != SSA_NAME)
1013 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1014 else if (TREE_CODE (to_arg) == SSA_NAME
1015 && from_arg != to_arg)
1016 {
1017 if (get_current_def (to_arg) == NULL_TREE)
1018 {
1019 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1020 TREE_TYPE (get_current_def
1021 (from_arg))));
1022 set_current_def (to_arg, get_current_def (from_arg));
1023 }
1024 }
1025 gsi_next (&gsi_from);
1026 gsi_next (&gsi_to);
1027 }
1028
1029 gphi *from_phi = get_virtual_phi (from->dest);
1030 gphi *to_phi = get_virtual_phi (to->dest);
1031 if (from_phi)
1032 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1033 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1034 }
1035
1036
1037 /* Given LOOP this function generates a new copy of it and puts it
1038 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1039 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1040 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1041 entry or exit of LOOP. */
1042
1043 class loop *
1044 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1045 class loop *scalar_loop, edge e)
1046 {
1047 class loop *new_loop;
1048 basic_block *new_bbs, *bbs, *pbbs;
1049 bool at_exit;
1050 bool was_imm_dom;
1051 basic_block exit_dest;
1052 edge exit, new_exit;
1053 bool duplicate_outer_loop = false;
1054
1055 exit = single_exit (loop);
1056 at_exit = (e == exit);
1057 if (!at_exit && e != loop_preheader_edge (loop))
1058 return NULL;
1059
1060 if (scalar_loop == NULL)
1061 scalar_loop = loop;
1062
1063 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1064 pbbs = bbs + 1;
1065 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1066 /* Allow duplication of outer loops. */
1067 if (scalar_loop->inner)
1068 duplicate_outer_loop = true;
1069 /* Check whether duplication is possible. */
1070 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1071 {
1072 free (bbs);
1073 return NULL;
1074 }
1075
1076 /* Generate new loop structure. */
1077 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1078 duplicate_subloops (scalar_loop, new_loop);
1079
1080 exit_dest = exit->dest;
1081 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1082 exit_dest) == loop->header ?
1083 true : false);
1084
1085 /* Also copy the pre-header; this avoids jumping through hoops to
1086 duplicate the loop entry PHI arguments. Create an empty
1087 pre-header unconditionally for this. */
1088 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1089 edge entry_e = single_pred_edge (preheader);
1090 bbs[0] = preheader;
1091 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1092
1093 exit = single_exit (scalar_loop);
1094 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1095 &exit, 1, &new_exit, NULL,
1096 at_exit ? loop->latch : e->src, true);
1097 exit = single_exit (loop);
1098 basic_block new_preheader = new_bbs[0];
1099
1100 /* Before installing PHI arguments make sure that the edges
1101 into them match that of the scalar loop we analyzed. This
1102 makes sure the SLP tree matches up between the main vectorized
1103 loop and the epilogue vectorized copies. */
1104 if (single_succ_edge (preheader)->dest_idx
1105 != single_succ_edge (new_bbs[0])->dest_idx)
1106 {
1107 basic_block swap_bb = new_bbs[1];
1108 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1109 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1110 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1111 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1112 }
1113 if (duplicate_outer_loop)
1114 {
1115 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1116 if (loop_preheader_edge (scalar_loop)->dest_idx
1117 != loop_preheader_edge (new_inner_loop)->dest_idx)
1118 {
1119 basic_block swap_bb = new_inner_loop->header;
1120 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1121 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1122 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1123 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1124 }
1125 }
1126
1127 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1128
1129 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1130 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1131 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1132
1133 if (scalar_loop != loop)
1134 {
1135 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1136 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1137 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1138 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1139 header) to have current_def set, so copy them over. */
1140 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1141 exit);
1142 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1143 0),
1144 EDGE_SUCC (loop->latch, 0));
1145 }
1146
1147 if (at_exit) /* Add the loop copy at exit. */
1148 {
1149 if (scalar_loop != loop)
1150 {
1151 gphi_iterator gsi;
1152 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1153
1154 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1155 gsi_next (&gsi))
1156 {
1157 gphi *phi = gsi.phi ();
1158 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1159 location_t orig_locus
1160 = gimple_phi_arg_location_from_edge (phi, e);
1161
1162 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1163 }
1164 }
1165 redirect_edge_and_branch_force (e, new_preheader);
1166 flush_pending_stmts (e);
1167 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1168 if (was_imm_dom || duplicate_outer_loop)
1169 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1170
1171 /* And remove the non-necessary forwarder again. Keep the other
1172 one so we have a proper pre-header for the loop at the exit edge. */
1173 redirect_edge_pred (single_succ_edge (preheader),
1174 single_pred (preheader));
1175 delete_basic_block (preheader);
1176 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1177 loop_preheader_edge (scalar_loop)->src);
1178 }
1179 else /* Add the copy at entry. */
1180 {
1181 if (scalar_loop != loop)
1182 {
1183 /* Remove the non-necessary forwarder of scalar_loop again. */
1184 redirect_edge_pred (single_succ_edge (preheader),
1185 single_pred (preheader));
1186 delete_basic_block (preheader);
1187 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1188 loop_preheader_edge (scalar_loop)->src);
1189 preheader = split_edge (loop_preheader_edge (loop));
1190 entry_e = single_pred_edge (preheader);
1191 }
1192
1193 redirect_edge_and_branch_force (entry_e, new_preheader);
1194 flush_pending_stmts (entry_e);
1195 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1196
1197 redirect_edge_and_branch_force (new_exit, preheader);
1198 flush_pending_stmts (new_exit);
1199 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1200
1201 /* And remove the non-necessary forwarder again. Keep the other
1202 one so we have a proper pre-header for the loop at the exit edge. */
1203 redirect_edge_pred (single_succ_edge (new_preheader),
1204 single_pred (new_preheader));
1205 delete_basic_block (new_preheader);
1206 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1207 loop_preheader_edge (new_loop)->src);
1208 }
1209
1210 if (scalar_loop != loop)
1211 {
1212 /* Update new_loop->header PHIs, so that on the preheader
1213 edge they are the ones from loop rather than scalar_loop. */
1214 gphi_iterator gsi_orig, gsi_new;
1215 edge orig_e = loop_preheader_edge (loop);
1216 edge new_e = loop_preheader_edge (new_loop);
1217
1218 for (gsi_orig = gsi_start_phis (loop->header),
1219 gsi_new = gsi_start_phis (new_loop->header);
1220 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1221 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1222 {
1223 gphi *orig_phi = gsi_orig.phi ();
1224 gphi *new_phi = gsi_new.phi ();
1225 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1226 location_t orig_locus
1227 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1228
1229 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1230 }
1231 }
1232
1233 free (new_bbs);
1234 free (bbs);
1235
1236 checking_verify_dominators (CDI_DOMINATORS);
1237
1238 return new_loop;
1239 }
1240
1241
1242 /* Given the condition expression COND, put it as the last statement of
1243 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1244 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1245 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1246 new edge as irreducible if IRREDUCIBLE_P is true. */
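/* Illustrative shape of the result (added for clarity, not part of the
   original comment): GUARD_BB ends in

     if (COND) goto GUARD_TO;            <- new edge, probability PROBABILITY
     else fall through towards the loop; <- existing edge, inverted probability

   */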
1247
1248 static edge
1249 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1250 basic_block guard_to, basic_block dom_bb,
1251 profile_probability probability, bool irreducible_p)
1252 {
1253 gimple_stmt_iterator gsi;
1254 edge new_e, enter_e;
1255 gcond *cond_stmt;
1256 gimple_seq gimplify_stmt_list = NULL;
1257
1258 enter_e = EDGE_SUCC (guard_bb, 0);
1259 enter_e->flags &= ~EDGE_FALLTHRU;
1260 enter_e->flags |= EDGE_FALSE_VALUE;
1261 gsi = gsi_last_bb (guard_bb);
1262
1263 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1264 NULL_TREE);
1265 if (gimplify_stmt_list)
1266 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1267
1268 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1269 gsi = gsi_last_bb (guard_bb);
1270 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1271
1272 /* Add new edge to connect guard block to the merge/loop-exit block. */
1273 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1274
1275 new_e->probability = probability;
1276 if (irreducible_p)
1277 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1278
1279 enter_e->probability = probability.invert ();
1280 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1281
1282 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1283 if (enter_e->dest->loop_father->header == enter_e->dest)
1284 split_edge (enter_e);
1285
1286 return new_e;
1287 }
1288
1289
1290 /* This function verifies that the following restrictions apply to LOOP:
1291 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1292 for an innermost loop, and of 5 basic blocks for an outer loop.
1293 (2) it is single entry, single exit
1294 (3) its exit condition is the last stmt in the header
1295 (4) E is the entry/exit edge of LOOP.
1296 */
1297
1298 bool
1299 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1300 {
1301 edge exit_e = single_exit (loop);
1302 edge entry_e = loop_preheader_edge (loop);
1303 gcond *orig_cond = get_loop_exit_condition (loop);
1304 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1305 unsigned int num_bb = loop->inner? 5 : 2;
1306
1307 /* All loops have an outer scope; the only case loop->outer is NULL is for
1308 the function itself. */
1309 if (!loop_outer (loop)
1310 || loop->num_nodes != num_bb
1311 || !empty_block_p (loop->latch)
1312 || !single_exit (loop)
1313 /* Verify that new loop exit condition can be trivially modified. */
1314 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1315 || (e != exit_e && e != entry_e))
1316 return false;
1317
1318 return true;
1319 }
1320
1321 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1322 in the exit bb and rename all the uses after the loop. This simplifies
1323 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1324 (but normally loop closed SSA form doesn't require virtual PHIs to be
1325 in the same form). Doing this early simplifies checking which
1326 uses should be renamed.
1327
1328 If we create a new phi after the loop, return the definition that
1329 applies on entry to the loop, otherwise return null. */
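/* Sketch of the transformation (illustration only): given a loop with a
   virtual PHI

     vdef = PHI <vdef_entry (preheader), vdef_latch (latch)>

   in its header and no virtual PHI in the exit block, we add

     new_vdef = PHI <vdef_latch (exit edge)>

   in the exit block, redirect all uses of vdef_latch outside the loop to
   new_vdef, and return vdef_entry.  */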
1330
1331 static tree
1332 create_lcssa_for_virtual_phi (class loop *loop)
1333 {
1334 gphi_iterator gsi;
1335 edge exit_e = single_exit (loop);
1336
1337 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1338 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1339 {
1340 gphi *phi = gsi.phi ();
1341 for (gsi = gsi_start_phis (exit_e->dest);
1342 !gsi_end_p (gsi); gsi_next (&gsi))
1343 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1344 break;
1345 if (gsi_end_p (gsi))
1346 {
1347 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1348 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1349 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1350 imm_use_iterator imm_iter;
1351 gimple *stmt;
1352 use_operand_p use_p;
1353
1354 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1355 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1356 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1357 gimple_phi_set_result (new_phi, new_vop);
1358 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1359 if (stmt != new_phi
1360 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1361 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1362 SET_USE (use_p, new_vop);
1363
1364 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1365 }
1366 break;
1367 }
1368 return NULL_TREE;
1369 }
1370
1371 /* Function vect_get_loop_location.
1372
1373 Extract the location of the loop in the source code.
1374 If the loop is not well formed for vectorization, an estimated
1375 location is calculated.
1376 Return the loop location on success and NULL otherwise. */
1377
1378 dump_user_location_t
1379 find_loop_location (class loop *loop)
1380 {
1381 gimple *stmt = NULL;
1382 basic_block bb;
1383 gimple_stmt_iterator si;
1384
1385 if (!loop)
1386 return dump_user_location_t ();
1387
1388 stmt = get_loop_exit_condition (loop);
1389
1390 if (stmt
1391 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1392 return stmt;
1393
1394 /* If we got here the loop is probably not "well formed",
1395 try to estimate the loop location.  */
1396
1397 if (!loop->header)
1398 return dump_user_location_t ();
1399
1400 bb = loop->header;
1401
1402 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1403 {
1404 stmt = gsi_stmt (si);
1405 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1406 return stmt;
1407 }
1408
1409 return dump_user_location_t ();
1410 }
1411
1412 /* Return true if the phi described by STMT_INFO defines an IV of the
1413 loop to be vectorized. */
1414
1415 static bool
1416 iv_phi_p (stmt_vec_info stmt_info)
1417 {
1418 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1419 if (virtual_operand_p (PHI_RESULT (phi)))
1420 return false;
1421
1422 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1423 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1424 return false;
1425
1426 return true;
1427 }
1428
1429 /* Function vect_can_advance_ivs_p
1430
1431 In case the number of iterations that LOOP iterates is unknown at compile
1432 time, an epilog loop will be generated, and the loop induction variables
1433 (IVs) will be "advanced" to the value they are supposed to take just before
1434 the epilog loop. Here we check that the access function of the loop IVs
1435 and the expression that represents the loop bound are simple enough.
1436 These restrictions will be relaxed in the future. */
1437
1438 bool
1439 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1440 {
1441 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1442 basic_block bb = loop->header;
1443 gphi_iterator gsi;
1444
1445 /* Analyze phi functions of the loop header. */
1446
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1449 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1450 {
1451 tree evolution_part;
1452
1453 gphi *phi = gsi.phi ();
1454 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1457 phi_info->stmt);
1458
1459 /* Skip virtual phi's. The data dependences that are associated with
1460 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1461
1462 Skip reduction phis. */
1463 if (!iv_phi_p (phi_info))
1464 {
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_NOTE, vect_location,
1467 "reduc or virtual phi. skip.\n");
1468 continue;
1469 }
1470
1471 /* Analyze the evolution function. */
1472
1473 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1474 if (evolution_part == NULL_TREE)
1475 {
1476 if (dump_enabled_p ())
1477 dump_printf (MSG_MISSED_OPTIMIZATION,
1478 "No access function or evolution.\n");
1479 return false;
1480 }
1481
1482 /* FORNOW: We do not transform initial conditions of IVs
1483 whose evolution functions are not invariant in the loop. */
1484
1485 if (!expr_invariant_in_loop_p (loop, evolution_part))
1486 {
1487 if (dump_enabled_p ())
1488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489 "evolution not invariant in loop.\n");
1490 return false;
1491 }
1492
1493 /* FORNOW: We do not transform initial conditions of IVs
1494 whose evolution function is a polynomial of degree >= 2. */
1495
1496 if (tree_is_chrec (evolution_part))
1497 {
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1500 "evolution is chrec.\n");
1501 return false;
1502 }
1503 }
1504
1505 return true;
1506 }
1507
1508
1509 /* Function vect_update_ivs_after_vectorizer.
1510
1511 "Advance" the induction variables of LOOP to the value they should take
1512 after the execution of LOOP. This is currently necessary because the
1513 vectorizer does not handle induction variables that are used after the
1514 loop. Such a situation occurs when the last iterations of LOOP are
1515 peeled, because:
1516 1. We introduced new uses after LOOP for IVs that were not originally used
1517 after LOOP: the IVs of LOOP are now used by an epilog loop.
1518 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1519 times, whereas the loop IVs should be bumped N times.
1520
1521 Input:
1522 - LOOP - a loop that is going to be vectorized. The last few iterations
1523 of LOOP were peeled.
1524 - NITERS - the number of iterations that LOOP executes (before it is
1525 vectorized), i.e., the number of times the ivs should be bumped.
1526 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1527 coming out from LOOP on which there are uses of the LOOP ivs
1528 (this is the path from LOOP->exit to epilog_loop->preheader).
1529
1530 The new definitions of the ivs are placed in LOOP->exit.
1531 The phi args associated with the edge UPDATE_E in the bb
1532 UPDATE_E->dest are updated accordingly.
1533
1534 Assumption 1: Like the rest of the vectorizer, this function assumes
1535 a single loop exit that has a single predecessor.
1536
1537 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1538 organized in the same order.
1539
1540 Assumption 3: The access function of the ivs is simple enough (see
1541 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1542
1543 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1544 coming out of LOOP on which the ivs of LOOP are used (this is the path
1545 that leads to the epilog loop; other paths skip the epilog loop). This
1546 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1547 needs to have its phis updated.
1548 */
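/* Illustrative computation (added for clarity, not part of the original
   comment): for an IV defined by

     iv = PHI <INIT (preheader), iv + STEP (latch)>

   the value after NITERS scalar iterations is INIT + NITERS * STEP, which
   is exactly what the code below builds (NI) and installs as the UPDATE_E
   PHI argument in UPDATE_BB.  */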
1549
1550 static void
1551 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1552 tree niters, edge update_e)
1553 {
1554 gphi_iterator gsi, gsi1;
1555 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1556 basic_block update_bb = update_e->dest;
1557 basic_block exit_bb = single_exit (loop)->dest;
1558
1559 /* Make sure there exists a single-predecessor exit bb: */
1560 gcc_assert (single_pred_p (exit_bb));
1561 gcc_assert (single_succ_edge (exit_bb) == update_e);
1562
1563 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1564 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1565 gsi_next (&gsi), gsi_next (&gsi1))
1566 {
1567 tree init_expr;
1568 tree step_expr, off;
1569 tree type;
1570 tree var, ni, ni_name;
1571 gimple_stmt_iterator last_gsi;
1572
1573 gphi *phi = gsi.phi ();
1574 gphi *phi1 = gsi1.phi ();
1575 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1579
1580 /* Skip reduction and virtual phis. */
1581 if (!iv_phi_p (phi_info))
1582 {
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "reduc or virtual phi. skip.\n");
1586 continue;
1587 }
1588
1589 type = TREE_TYPE (gimple_phi_result (phi));
1590 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1591 step_expr = unshare_expr (step_expr);
1592
1593 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1594 of degree >= 2 or exponential. */
1595 gcc_assert (!tree_is_chrec (step_expr));
1596
1597 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1598
1599 tree stype = TREE_TYPE (step_expr);
1600 off = fold_build2 (MULT_EXPR, stype,
1601 fold_convert (stype, niters), step_expr);
1602 if (POINTER_TYPE_P (type))
1603 ni = fold_build_pointer_plus (init_expr, off);
1604 else
1605 ni = fold_convert (type,
1606 fold_build2 (PLUS_EXPR, stype,
1607 fold_convert (stype, init_expr),
1608 off));
1609
1610 var = create_tmp_var (type, "tmp");
1611
1612 last_gsi = gsi_last_bb (exit_bb);
1613 gimple_seq new_stmts = NULL;
1614 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1615 /* Exit_bb shouldn't be empty. */
1616 if (!gsi_end_p (last_gsi))
1617 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1618 else
1619 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1620
1621 /* Fix phi expressions in the successor bb. */
1622 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1623 }
1624 }
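/* A standalone sketch of the arithmetic the function above emits for an
   integer IV, assuming a compile-time constant step; the names are
   illustrative only and the block is not built.  */
#if 0
#include <stdint.h>

/* Value of an IV with initial value INIT and step STEP after NITERS
   iterations; this is what the exit-block definition evaluates to and
   what the epilogue's phi argument is updated with.  */
static uint64_t
iv_value_after_niters (uint64_t init, uint64_t step, uint64_t niters)
{
  return init + niters * step;
}
#endif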
1625
1626 /* Return a gimple value containing the misalignment (measured in vector
1627 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1628 it is away from a perfectly aligned address. Add any new statements
1629 to SEQ. */
1630
1631 static tree
1632 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1633 {
1634 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1635 stmt_vec_info stmt_info = dr_info->stmt;
1636 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1637
1638 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1639 unsigned HOST_WIDE_INT target_align_c;
1640 tree target_align_minus_1;
1641
1642 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1643 size_zero_node) < 0;
1644 tree offset = (negative
1645 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1646 * TREE_INT_CST_LOW
1647 (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
1648 : size_zero_node);
1649 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1650 stmt_info, seq,
1651 offset);
1652 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1653 if (target_align.is_constant (&target_align_c))
1654 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1655 else
1656 {
1657 tree vla = build_int_cst (type, target_align);
1658 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1659 fold_build2 (MINUS_EXPR, type,
1660 build_int_cst (type, 0), vla));
1661 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1662 build_int_cst (type, 1));
1663 }
1664
1665 HOST_WIDE_INT elem_size
1666 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1667 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1668
1669 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1670 tree int_start_addr = fold_convert (type, start_addr);
1671 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1672 target_align_minus_1);
1673
1674 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1675 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1676 elem_size_log);
1677
1678 return misalign_in_elems;
1679 }
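/* A minimal sketch of the computation above, assuming the target alignment
   and element size are compile-time constant powers of two; the names are
   illustrative only and the block is not built.  */
#if 0
#include <stdint.h>

static uint64_t
misalign_in_elems (uintptr_t start_addr, uint64_t target_align,
		   uint64_t elem_size)
{
  /* misalign_in_bytes = addr & (target_align - 1).  */
  uint64_t misalign_in_bytes = (uint64_t) start_addr & (target_align - 1);
  /* misalign_in_elems = misalign_in_bytes / elem_size, done above as a
     right shift by log2 (elem_size).  */
  return misalign_in_bytes / elem_size;
}
#endif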
1680
1681 /* Function vect_gen_prolog_loop_niters
1682
1683 Generate the number of iterations which should be peeled as prolog for the
1684 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1685 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1686 As a result, after the execution of this loop, the data reference DR will
1687 refer to an aligned location. The following computation is generated:
1688
1689 If the misalignment of DR is known at compile time:
1690 addr_mis = int mis = DR_MISALIGNMENT (dr);
1691 Else, compute address misalignment in bytes:
1692 addr_mis = addr & (target_align - 1)
1693
1694 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1695
1696 (elem_size = element type size; an element is the scalar element whose type
1697 is the inner type of the vectype)
1698
1699 The computations will be emitted at the end of BB. We also compute and
1700 store upper bound (included) of the result in BOUND.
1701
1702 When the step of the data-ref in the loop is not 1 (as in interleaved data
1703 and SLP), the number of iterations of the prolog must be divided by the step
1704 (which is equal to the size of interleaved group).
1705
1706 The above formulas assume that VF == number of elements in the vector. This
1707 may not hold when there are multiple-types in the loop.
1708 In this case, for some data-references in the loop the VF does not represent
1709 the number of elements that fit in the vector. Therefore, instead of VF we
1710 use TYPE_VECTOR_SUBPARTS. */
1711
1712 static tree
1713 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1714 basic_block bb, int *bound)
1715 {
1716 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1717 tree var;
1718 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1719 gimple_seq stmts = NULL, new_stmts = NULL;
1720 tree iters, iters_name;
1721 stmt_vec_info stmt_info = dr_info->stmt;
1722 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1723 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1724
1725 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1726 {
1727 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1728
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_NOTE, vect_location,
1731 "known peeling = %d.\n", npeel);
1732
1733 iters = build_int_cst (niters_type, npeel);
1734 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1735 }
1736 else
1737 {
1738 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1739 tree type = TREE_TYPE (misalign_in_elems);
1740 HOST_WIDE_INT elem_size
1741 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1742 /* We only do prolog peeling if the target alignment is known at compile
1743 time. */
1744 poly_uint64 align_in_elems =
1745 exact_div (target_align, elem_size);
1746 tree align_in_elems_minus_1 =
1747 build_int_cst (type, align_in_elems - 1);
1748 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1749
1750 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1751 & (align_in_elems - 1)). */
1752 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1753 size_zero_node) < 0;
1754 if (negative)
1755 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1756 align_in_elems_tree);
1757 else
1758 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1759 misalign_in_elems);
1760 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1761 iters = fold_convert (niters_type, iters);
1762 unsigned HOST_WIDE_INT align_in_elems_c;
1763 if (align_in_elems.is_constant (&align_in_elems_c))
1764 *bound = align_in_elems_c - 1;
1765 else
1766 *bound = -1;
1767 }
1768
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "niters for prolog loop: %T\n", iters);
1772
1773 var = create_tmp_var (niters_type, "prolog_loop_niters");
1774 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1775
1776 if (new_stmts)
1777 gimple_seq_add_seq (&stmts, new_stmts);
1778 if (stmts)
1779 {
1780 gcc_assert (single_succ_p (bb));
1781 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1782 if (gsi_end_p (gsi))
1783 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1784 else
1785 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1786 }
1787 return iters_name;
1788 }
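/* A worked sketch of the prolog iteration count for the runtime-misalignment
   case above, assuming a positive unit step and a compile-time constant,
   power-of-two alignment; illustrative only, not built.  */
#if 0
#include <stdint.h>

static uint64_t
prolog_niters (uint64_t misalign_in_elems, uint64_t align_in_elems)
{
  /* Number of scalar iterations needed to reach the next aligned address,
     wrapping to zero when the access is already aligned.  */
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}
#endif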
1789
1790
1791 /* Function vect_update_init_of_dr
1792
1793 If CODE is PLUS, the vector loop starts NITERS iterations after the
1794 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1795 iterations before the scalar one (using masking to skip inactive
1796 elements). This function updates the information recorded in DR to
1797 account for the difference. Specifically, it updates the OFFSET
1798 field of DR_INFO. */
1799
1800 static void
1801 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1802 {
1803 struct data_reference *dr = dr_info->dr;
1804 tree offset = dr_info->offset;
1805 if (!offset)
1806 offset = build_zero_cst (sizetype);
1807
1808 niters = fold_build2 (MULT_EXPR, sizetype,
1809 fold_convert (sizetype, niters),
1810 fold_convert (sizetype, DR_STEP (dr)));
1811 offset = fold_build2 (code, sizetype,
1812 fold_convert (sizetype, offset), niters);
1813 dr_info->offset = offset;
1814 }
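/* A standalone sketch of the offset adjustment above, assuming the sizetype
   arithmetic fits in 64 bits; names are illustrative only, not built.  */
#if 0
#include <stdint.h>

static uint64_t
updated_dr_offset (uint64_t offset, uint64_t niters, uint64_t step,
		   int code_is_plus)
{
  uint64_t delta = niters * step;
  return code_is_plus ? offset + delta : offset - delta;
}
#endif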
1815
1816
1817 /* Function vect_update_inits_of_drs
1818
1819    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1820    CODE and NITERS are as for vect_update_init_of_dr.  */
1821
1822 void
1823 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1824 tree_code code)
1825 {
1826 unsigned int i;
1827 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1828 struct data_reference *dr;
1829
1830 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1831
1832 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1833      here, but since we might use these niters to update the epilogue's niters
1834 and data references we can't insert them here as this definition might not
1835 always dominate its uses. */
1836 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1837 niters = fold_convert (sizetype, niters);
1838
1839 FOR_EACH_VEC_ELT (datarefs, i, dr)
1840 {
1841 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1842 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1843 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1844 vect_update_init_of_dr (dr_info, niters, code);
1845 }
1846 }
1847
1848 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1849 by masking. This involves calculating the number of iterations to
1850 be peeled and then aligning all memory references appropriately. */
1851
1852 void
1853 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1854 {
1855 tree misalign_in_elems;
1856 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1857
1858 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1859
1860 /* From the information recorded in LOOP_VINFO get the number of iterations
1861 that need to be skipped via masking. */
1862 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1863 {
1864 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1865 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1866 misalign_in_elems = build_int_cst (type, misalign);
1867 }
1868 else
1869 {
1870 gimple_seq seq1 = NULL, seq2 = NULL;
1871 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1872 misalign_in_elems = fold_convert (type, misalign_in_elems);
1873 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1874 &seq2, true, NULL_TREE);
1875 gimple_seq_add_seq (&seq1, seq2);
1876 if (seq1)
1877 {
1878 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1879 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1880 gcc_assert (!new_bb);
1881 }
1882 }
1883
1884 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_NOTE, vect_location,
1886 "misalignment for fully-masked loop: %T\n",
1887 misalign_in_elems);
1888
1889 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1890
1891 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1892 }
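/* A one-line sketch of the known-peeling case above: the number of initial
   iterations to skip via masking is the vectorization factor minus the
   peeling amount; illustrative only, not built.  */
#if 0
static unsigned
mask_skip_niters (unsigned vf, unsigned npeel)
{
  return vf - npeel;
}
#endif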
1893
1894 /* This function builds ni_name = number of iterations. Statements
1895 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1896    it to TRUE if a new ssa_var is generated.  */
1897
1898 tree
1899 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1900 {
1901 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1902 if (TREE_CODE (ni) == INTEGER_CST)
1903 return ni;
1904 else
1905 {
1906 tree ni_name, var;
1907 gimple_seq stmts = NULL;
1908 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1909
1910 var = create_tmp_var (TREE_TYPE (ni), "niters");
1911 ni_name = force_gimple_operand (ni, &stmts, false, var);
1912 if (stmts)
1913 {
1914 gsi_insert_seq_on_edge_immediate (pe, stmts);
1915 if (new_var_p != NULL)
1916 *new_var_p = true;
1917 }
1918
1919 return ni_name;
1920 }
1921 }
1922
1923 /* Calculate the number of iterations above which the vectorized loop is
1924    preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1925 of prolog loop. If it's integer const, the integer number is also passed
1926 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1927 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1928 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1929 threshold below which the scalar (rather than vectorized) loop will be
1930 executed. This function stores the upper bound (inclusive) of the result
1931 in BOUND_SCALAR. */
1932
1933 static tree
1934 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1935 int bound_prolog, poly_int64 bound_epilog, int th,
1936 poly_uint64 *bound_scalar,
1937 bool check_profitability)
1938 {
1939 tree type = TREE_TYPE (niters_prolog);
1940 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1941 build_int_cst (type, bound_epilog));
1942
1943 *bound_scalar = bound_prolog + bound_epilog;
1944 if (check_profitability)
1945 {
1946 /* TH indicates the minimum niters of vectorized loop, while we
1947 compute the maximum niters of scalar loop. */
1948 th--;
1949 /* Peeling for constant times. */
1950 if (int_niters_prolog >= 0)
1951 {
1952 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1953 return build_int_cst (type, *bound_scalar);
1954 }
1955 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1956 and BOUND_EPILOG are inclusive upper bounds. */
1957 if (known_ge (th, bound_prolog + bound_epilog))
1958 {
1959 *bound_scalar = th;
1960 return build_int_cst (type, th);
1961 }
1962 /* Need to do runtime comparison. */
1963 else if (maybe_gt (th, bound_epilog))
1964 {
1965 *bound_scalar = upper_bound (*bound_scalar, th);
1966 return fold_build2 (MAX_EXPR, type,
1967 build_int_cst (type, th), niters);
1968 }
1969 }
1970 return niters;
1971 }
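/* A simplified sketch of the bound chosen above when profitability checking
   is enabled and the prolog iteration count is a compile-time constant;
   names are illustrative only, not built.  */
#if 0
static unsigned
scalar_loop_niters_bound (unsigned niters_prolog, unsigned bound_epilog,
			  unsigned th)
{
  unsigned niters = niters_prolog + bound_epilog;
  /* TH is the minimum niters of the vectorized loop, so the scalar loop
     runs at most TH - 1 iterations; take the larger of the two bounds.  */
  th -= 1;
  return niters > th ? niters : th;
}
#endif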
1972
1973 /* NITERS is the number of times that the original scalar loop executes
1974 after peeling. Work out the maximum number of iterations N that can
1975 be handled by the vectorized form of the loop and then either:
1976
1977 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1978
1979 niters_vector = N
1980
1981 b) set *STEP_VECTOR_PTR to one and generate:
1982
1983 niters_vector = N / vf
1984
1985 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1986 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1987 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1988
1989 void
1990 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1991 tree *niters_vector_ptr, tree *step_vector_ptr,
1992 bool niters_no_overflow)
1993 {
1994 tree ni_minus_gap, var;
1995 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1996 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1998 tree log_vf = NULL_TREE;
1999
2000 /* If epilogue loop is required because of data accesses with gaps, we
2001 subtract one iteration from the total number of iterations here for
2002 correct calculation of RATIO. */
2003 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2004 {
2005 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2006 build_one_cst (type));
2007 if (!is_gimple_val (ni_minus_gap))
2008 {
2009 var = create_tmp_var (type, "ni_gap");
2010 gimple *stmts = NULL;
2011 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2012 true, var);
2013 gsi_insert_seq_on_edge_immediate (pe, stmts);
2014 }
2015 }
2016 else
2017 ni_minus_gap = niters;
2018
2019 unsigned HOST_WIDE_INT const_vf;
2020 if (vf.is_constant (&const_vf)
2021 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2022 {
2023 /* Create: niters >> log2(vf) */
2024 /* If it's known that niters == number of latch executions + 1 doesn't
2025 overflow, we can generate niters >> log2(vf); otherwise we generate
2026 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2027 will be at least one. */
2028 log_vf = build_int_cst (type, exact_log2 (const_vf));
2029 if (niters_no_overflow)
2030 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2031 else
2032 niters_vector
2033 = fold_build2 (PLUS_EXPR, type,
2034 fold_build2 (RSHIFT_EXPR, type,
2035 fold_build2 (MINUS_EXPR, type,
2036 ni_minus_gap,
2037 build_int_cst (type, vf)),
2038 log_vf),
2039 build_int_cst (type, 1));
2040 step_vector = build_one_cst (type);
2041 }
2042 else
2043 {
2044 niters_vector = ni_minus_gap;
2045 step_vector = build_int_cst (type, vf);
2046 }
2047
2048 if (!is_gimple_val (niters_vector))
2049 {
2050 var = create_tmp_var (type, "bnd");
2051 gimple_seq stmts = NULL;
2052 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2053 gsi_insert_seq_on_edge_immediate (pe, stmts);
2054       /* The peeling algorithm guarantees that the vector loop bound is at
2055 	 least ONE; we set range information to make the niters analyzer's
2056 	 life easier.  Note the latch iteration count can be TYPE_MAX_VALUE, so
2057 	 we have to represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf.  */
2058 if (stmts != NULL && log_vf)
2059 {
2060 if (niters_no_overflow)
2061 set_range_info (niters_vector, VR_RANGE,
2062 wi::one (TYPE_PRECISION (type)),
2063 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2064 TYPE_SIGN (type)),
2065 exact_log2 (const_vf),
2066 TYPE_SIGN (type)));
2067 /* For VF == 1 the vector IV might also overflow so we cannot
2068 assert a minimum value of 1. */
2069 else if (const_vf > 1)
2070 set_range_info (niters_vector, VR_RANGE,
2071 wi::one (TYPE_PRECISION (type)),
2072 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2073 TYPE_SIGN (type))
2074 - (const_vf - 1),
2075 exact_log2 (const_vf), TYPE_SIGN (type))
2076 + 1);
2077 }
2078 }
2079 *niters_vector_ptr = niters_vector;
2080 *step_vector_ptr = step_vector;
2081
2082 return;
2083 }
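/* A standalone sketch of the niters_vector computation above for a
   compile-time constant, power-of-two VF without partial vectors; names are
   illustrative only, not built.  */
#if 0
#include <stdint.h>

static uint64_t
vector_loop_niters (uint64_t niters, unsigned log2_vf, int niters_no_overflow)
{
  uint64_t vf = (uint64_t) 1 << log2_vf;
  if (niters_no_overflow)
    return niters >> log2_vf;
  /* NITERS == latch executions + 1 may wrap to zero; relying on the ratio
     being at least one, compute ((niters - vf) >> log2_vf) + 1 instead.  */
  return ((niters - vf) >> log2_vf) + 1;
}
#endif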
2084
2085 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2086 loop specified by LOOP_VINFO after vectorization, compute the number
2087 of iterations before vectorization (niters_vector * vf) and store it
2088 to NITERS_VECTOR_MULT_VF_PTR. */
2089
2090 static void
2091 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2092 tree niters_vector,
2093 tree *niters_vector_mult_vf_ptr)
2094 {
2095 /* We should be using a step_vector of VF if VF is variable. */
2096 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2097 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2098 tree type = TREE_TYPE (niters_vector);
2099 tree log_vf = build_int_cst (type, exact_log2 (vf));
2100 basic_block exit_bb = single_exit (loop)->dest;
2101
2102 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2103 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2104 niters_vector, log_vf);
2105 if (!is_gimple_val (niters_vector_mult_vf))
2106 {
2107 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2108 gimple_seq stmts = NULL;
2109 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2110 &stmts, true, var);
2111 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2112 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2113 }
2114 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2115 }
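/* The inverse of the division above, as a one-line sketch assuming a
   compile-time constant, power-of-two VF; illustrative only, not built.  */
#if 0
static unsigned long
scalar_niters_handled_by_vector_loop (unsigned long niters_vector,
				      unsigned log2_vf)
{
  return niters_vector << log2_vf;
}
#endif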
2116
2117 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2118    This function searches for the corresponding lcssa phi node in the exit
2119    bb of LOOP.  If it is found, return the phi result; otherwise return
2120    NULL.  */
2121
2122 static tree
2123 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2124 gphi *lcssa_phi)
2125 {
2126 gphi_iterator gsi;
2127 edge e = single_exit (loop);
2128
2129 gcc_assert (single_pred_p (e->dest));
2130 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2131 {
2132 gphi *phi = gsi.phi ();
2133 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2134 PHI_ARG_DEF (lcssa_phi, 0), 0))
2135 return PHI_RESULT (phi);
2136 }
2137 return NULL_TREE;
2138 }
2139
2140 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2141 from SECOND/FIRST and puts it at the original loop's preheader/exit
2142 edge, the two loops are arranged as below:
2143
2144 preheader_a:
2145 first_loop:
2146 header_a:
2147 i_1 = PHI<i_0, i_2>;
2148 ...
2149 i_2 = i_1 + 1;
2150 if (cond_a)
2151 goto latch_a;
2152 else
2153 goto between_bb;
2154 latch_a:
2155 goto header_a;
2156
2157 between_bb:
2158 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2159
2160 second_loop:
2161 header_b:
2162 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2163 or with i_2 if no LCSSA phi is created
2164 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2165 ...
2166 i_4 = i_3 + 1;
2167 if (cond_b)
2168 goto latch_b;
2169 else
2170 goto exit_bb;
2171 latch_b:
2172 goto header_b;
2173
2174 exit_bb:
2175
2176    This function creates loop-closed SSA for the first loop and updates the
2177    second loop's PHI nodes by replacing the argument on the incoming edge
2178    with the result of the newly created lcssa PHI nodes.  If
2179    CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
2180    created for the non-iv phis of the first loop.
2181
2182 This function assumes exit bb of the first loop is preheader bb of the
2183    second loop, i.e., between_bb in the example code.  With the PHIs updated,
2184    the second loop will execute the remaining iterations of the first.  */
2185
2186 static void
2187 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2188 class loop *first, class loop *second,
2189 bool create_lcssa_for_iv_phis)
2190 {
2191 gphi_iterator gsi_update, gsi_orig;
2192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2193
2194 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2195 edge second_preheader_e = loop_preheader_edge (second);
2196 basic_block between_bb = single_exit (first)->dest;
2197
2198 gcc_assert (between_bb == second_preheader_e->src);
2199 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2200 /* Either the first loop or the second is the loop to be vectorized. */
2201 gcc_assert (loop == first || loop == second);
2202
2203 for (gsi_orig = gsi_start_phis (first->header),
2204 gsi_update = gsi_start_phis (second->header);
2205 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2206 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2207 {
2208 gphi *orig_phi = gsi_orig.phi ();
2209 gphi *update_phi = gsi_update.phi ();
2210
2211 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2212 /* Generate lcssa PHI node for the first loop. */
2213 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2214 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2215 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2216 {
2217 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2218 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2219 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2220 arg = new_res;
2221 }
2222
2223 /* Update PHI node in the second loop by replacing arg on the loop's
2224 incoming edge. */
2225 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2226 }
2227
2228 /* For epilogue peeling we have to make sure to copy all LC PHIs
2229 for correct vectorization of live stmts. */
2230 if (loop == first)
2231 {
2232 basic_block orig_exit = single_exit (second)->dest;
2233 for (gsi_orig = gsi_start_phis (orig_exit);
2234 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2235 {
2236 gphi *orig_phi = gsi_orig.phi ();
2237 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2238 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2239 continue;
2240
2241 /* Already created in the above loop. */
2242 if (find_guard_arg (first, second, orig_phi))
2243 continue;
2244
2245 tree new_res = copy_ssa_name (orig_arg);
2246 gphi *lcphi = create_phi_node (new_res, between_bb);
2247 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2248 }
2249 }
2250 }
2251
2252 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2253 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2254 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2255 appear like below:
2256
2257 guard_bb:
2258 if (cond)
2259 goto merge_bb;
2260 else
2261 goto skip_loop;
2262
2263 skip_loop:
2264 header_a:
2265 i_1 = PHI<i_0, i_2>;
2266 ...
2267 i_2 = i_1 + 1;
2268 if (cond_a)
2269 goto latch_a;
2270 else
2271 goto exit_a;
2272 latch_a:
2273 goto header_a;
2274
2275 exit_a:
2276 i_5 = PHI<i_2>;
2277
2278 merge_bb:
2279 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2280
2281 update_loop:
2282 header_b:
2283 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2284 ...
2285 i_4 = i_3 + 1;
2286 if (cond_b)
2287 goto latch_b;
2288 else
2289 goto exit_bb;
2290 latch_b:
2291 goto header_b;
2292
2293 exit_bb:
2294
2295 This function creates PHI nodes at merge_bb and replaces the use of i_5
2296    in the update_loop's PHI node with the result of the new PHI node.  */
2297
2298 static void
2299 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2300 class loop *update_loop,
2301 edge guard_edge, edge merge_edge)
2302 {
2303 location_t merge_loc, guard_loc;
2304 edge orig_e = loop_preheader_edge (skip_loop);
2305 edge update_e = loop_preheader_edge (update_loop);
2306 gphi_iterator gsi_orig, gsi_update;
2307
2308 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2309 gsi_update = gsi_start_phis (update_loop->header));
2310 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2311 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2312 {
2313 gphi *orig_phi = gsi_orig.phi ();
2314 gphi *update_phi = gsi_update.phi ();
2315
2316 /* Generate new phi node at merge bb of the guard. */
2317 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2318 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2319
2320 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2321 args in NEW_PHI for these edges. */
2322 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2323 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2324 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2325 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2326 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2327 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2328
2329 /* Update phi in UPDATE_PHI. */
2330 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2331 }
2332 }
2333
2334 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2335 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2336 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2337 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2338 The CFG looks like:
2339
2340 loop:
2341 header_a:
2342 i_1 = PHI<i_0, i_2>;
2343 ...
2344 i_2 = i_1 + 1;
2345 if (cond_a)
2346 goto latch_a;
2347 else
2348 goto exit_a;
2349 latch_a:
2350 goto header_a;
2351
2352 exit_a:
2353
2354 guard_bb:
2355 if (cond)
2356 goto merge_bb;
2357 else
2358 goto epilog_loop;
2359
2360 ;; fall_through_bb
2361
2362 epilog_loop:
2363 header_b:
2364 i_3 = PHI<i_2, i_4>;
2365 ...
2366 i_4 = i_3 + 1;
2367 if (cond_b)
2368 goto latch_b;
2369 else
2370 goto merge_bb;
2371 latch_b:
2372 goto header_b;
2373
2374 merge_bb:
2375 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2376
2377 exit_bb:
2378 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2379
2380    For each name used outside EPILOG (i.e. for each name that has an lcssa
2381 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2382 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2383 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2384 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2385 in exit_bb will also be updated. */
2386
2387 static void
2388 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2389 edge guard_edge, edge merge_edge)
2390 {
2391 gphi_iterator gsi;
2392 basic_block merge_bb = guard_edge->dest;
2393
2394 gcc_assert (single_succ_p (merge_bb));
2395 edge e = single_succ_edge (merge_bb);
2396 basic_block exit_bb = e->dest;
2397 gcc_assert (single_pred_p (exit_bb));
2398 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2399
2400 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2401 {
2402 gphi *update_phi = gsi.phi ();
2403 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2404
2405 tree merge_arg = NULL_TREE;
2406
2407 /* If the old argument is a SSA_NAME use its current_def. */
2408 if (TREE_CODE (old_arg) == SSA_NAME)
2409 merge_arg = get_current_def (old_arg);
2410 /* If it's a constant or doesn't have a current_def, just use the old
2411 argument. */
2412 if (!merge_arg)
2413 merge_arg = old_arg;
2414
2415 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2416 /* If the var is live after loop but not a reduction, we simply
2417 use the old arg. */
2418 if (!guard_arg)
2419 guard_arg = old_arg;
2420
2421 /* Create new phi node in MERGE_BB: */
2422 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2423 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2424
2425 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2426 the two PHI args in merge_phi for these edges. */
2427 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2428 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2429
2430 /* Update the original phi in exit_bb. */
2431 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2432 }
2433 }
2434
2435 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2436    the arg of its loop-closed ssa PHI needs to be updated.  */
2437
2438 static void
2439 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2440 {
2441 gphi_iterator gsi;
2442 basic_block exit_bb = single_exit (epilog)->dest;
2443
2444 gcc_assert (single_pred_p (exit_bb));
2445 edge e = EDGE_PRED (exit_bb, 0);
2446 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2447 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2448 }
2449
2450 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2451 iterate exactly CONST_NITERS times. Make a final decision about
2452 whether the epilogue loop should be used, returning true if so. */
2453
2454 static bool
2455 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2456 unsigned HOST_WIDE_INT const_niters)
2457 {
2458 /* Avoid wrap-around when computing const_niters - 1. Also reject
2459 using an epilogue loop for a single scalar iteration, even if
2460 we could in principle implement that using partial vectors. */
2461 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2462 if (const_niters <= gap_niters + 1)
2463 return false;
2464
2465 /* Install the number of iterations. */
2466 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2467 tree niters_tree = build_int_cst (niters_type, const_niters);
2468 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2469
2470 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2471 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2472
2473 /* Decide what to do if the number of epilogue iterations is not
2474 a multiple of the epilogue loop's vectorization factor. */
2475 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2476 }
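/* A sketch of the early-out above: an epilogue is rejected when, after
   allowing for gap peeling, it would cover at most one scalar iteration;
   illustrative only, not built.  */
#if 0
static int
epilogue_niters_worth_using (unsigned long const_niters, unsigned gap_niters)
{
  /* Also keeps const_niters - 1 from wrapping around.  */
  return const_niters > gap_niters + 1UL;
}
#endif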
2477
2478 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2479 Return a value that equals:
2480
2481 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2482 - SKIP_VALUE when the main loop is skipped. */
2483
2484 tree
2485 vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2486 tree skip_value)
2487 {
2488 gcc_assert (loop_vinfo->main_loop_edge);
2489
2490 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2491 basic_block bb = loop_vinfo->main_loop_edge->dest;
2492 gphi *new_phi = create_phi_node (phi_result, bb);
2493 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2494 UNKNOWN_LOCATION);
2495 add_phi_arg (new_phi, skip_value,
2496 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2497 return phi_result;
2498 }
2499
2500 /* Function vect_do_peeling.
2501
2502 Input:
2503 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2504
2505 preheader:
2506 LOOP:
2507 header_bb:
2508 loop_body
2509 if (exit_loop_cond) goto exit_bb
2510 else goto header_bb
2511 exit_bb:
2512
2513 - NITERS: The number of iterations of the loop.
2514 - NITERSM1: The number of iterations of the loop's latch.
2515 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2516 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2517 CHECK_PROFITABILITY is true.
2518 Output:
2519 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2520 iterate after vectorization; see vect_set_loop_condition for details.
2521 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2522 should be set to the number of scalar iterations handled by the
2523 vector loop. The SSA name is only used on exit from the loop.
2524
2525 This function peels prolog and epilog from the loop, adds guards skipping
2526 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2527 would look like:
2528
2529 guard_bb_1:
2530 if (prefer_scalar_loop) goto merge_bb_1
2531 else goto guard_bb_2
2532
2533 guard_bb_2:
2534 if (skip_prolog) goto merge_bb_2
2535 else goto prolog_preheader
2536
2537 prolog_preheader:
2538 PROLOG:
2539 prolog_header_bb:
2540 prolog_body
2541 if (exit_prolog_cond) goto prolog_exit_bb
2542 else goto prolog_header_bb
2543 prolog_exit_bb:
2544
2545 merge_bb_2:
2546
2547 vector_preheader:
2548 VECTOR LOOP:
2549 vector_header_bb:
2550 vector_body
2551 if (exit_vector_cond) goto vector_exit_bb
2552 else goto vector_header_bb
2553 vector_exit_bb:
2554
2555 guard_bb_3:
2556 if (skip_epilog) goto merge_bb_3
2557 else goto epilog_preheader
2558
2559 merge_bb_1:
2560
2561 epilog_preheader:
2562 EPILOG:
2563 epilog_header_bb:
2564 epilog_body
2565 if (exit_epilog_cond) goto merge_bb_3
2566 else goto epilog_header_bb
2567
2568 merge_bb_3:
2569
2570    Note that this function peels the prolog and epilog, and adds the
2571    guards, only when necessary.
2572 This function returns the epilogue loop if a decision was made to vectorize
2573 it, otherwise NULL.
2574
2575 The analysis resulting in this epilogue loop's loop_vec_info was performed
2576 in the same vect_analyze_loop call as the main loop's. At that time
2577 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2578 vectorization factors than the main loop. This list is stored in the main
2579    loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2580 vectorize the epilogue loop for a lower vectorization factor, the
2581 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2582 updated and linked to the epilogue loop. This is later used to vectorize
2583 the epilogue. The reason the loop_vec_info needs updating is that it was
2584 constructed based on the original main loop, and the epilogue loop is a
2585 copy of this loop, so all links pointing to statements in the original loop
2586 need updating. Furthermore, these loop_vec_infos share the
2587 data_reference's records, which will also need to be updated.
2588
2589 TODO: Guard for prefer_scalar_loop should be emitted along with
2590 versioning conditions if loop versioning is needed. */
2591
2592
2593 class loop *
2594 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2595 tree *niters_vector, tree *step_vector,
2596 tree *niters_vector_mult_vf_var, int th,
2597 bool check_profitability, bool niters_no_overflow,
2598 tree *advance)
2599 {
2600 edge e, guard_e;
2601 tree type = TREE_TYPE (niters), guard_cond;
2602 basic_block guard_bb, guard_to;
2603 profile_probability prob_prolog, prob_vector, prob_epilog;
2604 int estimated_vf;
2605 int prolog_peeling = 0;
2606 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2607 bool vect_epilogues_updated_niters = false;
2608 /* We currently do not support prolog peeling if the target alignment is not
2609 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2610 target alignment being constant. */
2611 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2612 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2613 return NULL;
2614
2615 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2616 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2617
2618 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2619 poly_uint64 bound_epilog = 0;
2620 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2621 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2622 bound_epilog += vf - 1;
2623 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2624 bound_epilog += 1;
2625 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2626 poly_uint64 bound_scalar = bound_epilog;
2627
2628 if (!prolog_peeling && !epilog_peeling)
2629 return NULL;
2630
2631 /* Before doing any peeling make sure to reset debug binds outside of
2632      the loop referring to defs not in LC SSA.  */
2633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2634 for (unsigned i = 0; i < loop->num_nodes; ++i)
2635 {
2636 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2637 imm_use_iterator ui;
2638 gimple *use_stmt;
2639 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2640 gsi_next (&gsi))
2641 {
2642 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2643 if (gimple_debug_bind_p (use_stmt)
2644 && loop != gimple_bb (use_stmt)->loop_father
2645 && !flow_loop_nested_p (loop,
2646 gimple_bb (use_stmt)->loop_father))
2647 {
2648 gimple_debug_bind_reset_value (use_stmt);
2649 update_stmt (use_stmt);
2650 }
2651 }
2652 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2653 gsi_next (&gsi))
2654 {
2655 ssa_op_iter op_iter;
2656 def_operand_p def_p;
2657 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2658 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2659 if (gimple_debug_bind_p (use_stmt)
2660 && loop != gimple_bb (use_stmt)->loop_father
2661 && !flow_loop_nested_p (loop,
2662 gimple_bb (use_stmt)->loop_father))
2663 {
2664 gimple_debug_bind_reset_value (use_stmt);
2665 update_stmt (use_stmt);
2666 }
2667 }
2668 }
2669
2670 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2671 estimated_vf = vect_vf_for_cost (loop_vinfo);
2672 if (estimated_vf == 2)
2673 estimated_vf = 3;
2674 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2675 .apply_scale (estimated_vf - 1, estimated_vf);
2676
2677 class loop *prolog, *epilog = NULL;
2678 class loop *first_loop = loop;
2679 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2680
2681 /* We might have a queued need to update virtual SSA form. As we
2682 delete the update SSA machinery below after doing a regular
2683 incremental SSA update during loop copying make sure we don't
2684 lose that fact.
2685 ??? Needing to update virtual SSA form by renaming is unfortunate
2686 but not all of the vectorizer code inserting new loads / stores
2687 properly assigns virtual operands to those statements. */
2688 update_ssa (TODO_update_ssa_only_virtuals);
2689
2690 create_lcssa_for_virtual_phi (loop);
2691
2692 /* If we're vectorizing an epilogue loop, the update_ssa above will
2693 have ensured that the virtual operand is in SSA form throughout the
2694 vectorized main loop. Normally it is possible to trace the updated
2695 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2696 back to scalar-stmt vuses, meaning that the effect of the SSA update
2697 remains local to the main loop. However, there are rare cases in
2698 which the vectorized loop has vdefs even when the original scalar
2699 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2700 introduces clobbers of the temporary vector array, which in turn
2701 needs new vdefs. If the scalar loop doesn't write to memory, these
2702 new vdefs will be the only ones in the vector loop.
2703
2704 In that case, update_ssa will have added a new virtual phi to the
2705 main loop, which previously didn't need one. Ensure that we (locally)
2706 maintain LCSSA form for the virtual operand, just as we would have
2707 done if the virtual phi had existed from the outset. This makes it
2708 easier to duplicate the scalar epilogue loop below. */
2709 tree vop_to_rename = NULL_TREE;
2710 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2711 {
2712 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2713 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2714 }
2715
2716 if (MAY_HAVE_DEBUG_BIND_STMTS)
2717 {
2718 gcc_assert (!adjust_vec.exists ());
2719 adjust_vec.create (32);
2720 }
2721 initialize_original_copy_tables ();
2722
2723 /* Record the anchor bb at which the guard should be placed if the scalar
2724 loop might be preferred. */
2725 basic_block anchor = loop_preheader_edge (loop)->src;
2726
2727 /* Generate the number of iterations for the prolog loop. We do this here
2728 so that we can also get the upper bound on the number of iterations. */
2729 tree niters_prolog;
2730 int bound_prolog = 0;
2731 if (prolog_peeling)
2732 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2733 &bound_prolog);
2734 else
2735 niters_prolog = build_int_cst (type, 0);
2736
2737 loop_vec_info epilogue_vinfo = NULL;
2738 if (vect_epilogues)
2739 {
2740 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2741 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2742 }
2743
2744 tree niters_vector_mult_vf = NULL_TREE;
2745   /* Save NITERS before the loop, as it may be changed by the prologue.  */
2746 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2747 edge update_e = NULL, skip_e = NULL;
2748 unsigned int lowest_vf = constant_lower_bound (vf);
2749 /* If we know the number of scalar iterations for the main loop we should
2750 check whether after the main loop there are enough iterations left over
2751 for the epilogue. */
2752 if (vect_epilogues
2753 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2754 && prolog_peeling >= 0
2755 && known_eq (vf, lowest_vf))
2756 {
2757 unsigned HOST_WIDE_INT eiters
2758 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2759 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2760
2761 eiters -= prolog_peeling;
2762 eiters
2763 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2764
2765 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2766 {
2767 delete epilogue_vinfo;
2768 epilogue_vinfo = NULL;
2769 if (loop_vinfo->epilogue_vinfos.length () == 0)
2770 {
2771 vect_epilogues = false;
2772 break;
2773 }
2774 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2775 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2776 }
2777 vect_epilogues_updated_niters = true;
2778 }
2779 /* Prolog loop may be skipped. */
2780 bool skip_prolog = (prolog_peeling != 0);
2781   /* Skip from this loop to the epilog when there are not enough iterations
2782      to enter the vectorized loop.  If so, we perform runtime checks on
2783      NITERS to decide whether to skip the current vectorized loop.  If we
2784      know the number of scalar iterations, we only add such a check when
2785      that number may be smaller than the number of iterations required to
2786      enter this loop; for this we use the upper bounds on the prolog and
2787      epilog peeling.  When we don't know the number of iterations and don't
2788      require versioning, we have already asserted that there are enough
2789      scalar iterations to enter the main loop, so this skip is not
2790      necessary.  When we are versioning, we only add such a skip if we have
2791      chosen to vectorize the epilogue.  */
2793 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2794 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2795 bound_prolog + bound_epilog)
2796 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2797 || vect_epilogues));
2798 /* Epilog loop must be executed if the number of iterations for epilog
2799 loop is known at compile time, otherwise we need to add a check at
2800 the end of vector loop and skip to the end of epilog loop. */
2801 bool skip_epilog = (prolog_peeling < 0
2802 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2803 || !vf.is_constant ());
2804 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2805 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2806 skip_epilog = false;
2807
2808 if (skip_vector)
2809 {
2810 split_edge (loop_preheader_edge (loop));
2811
2812 /* Due to the order in which we peel prolog and epilog, we first
2813 propagate probability to the whole loop. The purpose is to
2814 avoid adjusting probabilities of both prolog and vector loops
2815 separately. Note in this case, the probability of epilog loop
2816 needs to be scaled back later. */
2817 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2818 if (prob_vector.initialized_p ())
2819 {
2820 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2821 scale_loop_profile (loop, prob_vector, 0);
2822 }
2823 }
2824
2825 dump_user_location_t loop_loc = find_loop_location (loop);
2826 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2827 if (vect_epilogues)
2828 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2829 use the original scalar loop as remaining epilogue if necessary. */
2830 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2831 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2832
2833 if (prolog_peeling)
2834 {
2835 e = loop_preheader_edge (loop);
2836 if (!slpeel_can_duplicate_loop_p (loop, e))
2837 {
2838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2839 "loop can't be duplicated to preheader edge.\n");
2840 gcc_unreachable ();
2841 }
2842 /* Peel prolog and put it on preheader edge of loop. */
2843 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2844 if (!prolog)
2845 {
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2847 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2848 gcc_unreachable ();
2849 }
2850 prolog->force_vectorize = false;
2851 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2852 first_loop = prolog;
2853 reset_original_copy_tables ();
2854
2855 /* Update the number of iterations for prolog loop. */
2856 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2857 vect_set_loop_condition (prolog, NULL, niters_prolog,
2858 step_prolog, NULL_TREE, false);
2859
2860 /* Skip the prolog loop. */
2861 if (skip_prolog)
2862 {
2863 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2864 niters_prolog, build_int_cst (type, 0));
2865 guard_bb = loop_preheader_edge (prolog)->src;
2866 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2867 guard_to = split_edge (loop_preheader_edge (loop));
2868 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2869 guard_to, guard_bb,
2870 prob_prolog.invert (),
2871 irred_flag);
2872 e = EDGE_PRED (guard_to, 0);
2873 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2874 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2875
2876 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2877 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2878 }
2879
2880 /* Update init address of DRs. */
2881 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2882 /* Update niters for vector loop. */
2883 LOOP_VINFO_NITERS (loop_vinfo)
2884 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2885 LOOP_VINFO_NITERSM1 (loop_vinfo)
2886 = fold_build2 (MINUS_EXPR, type,
2887 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2888 bool new_var_p = false;
2889 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2890 /* It's guaranteed that vector loop bound before vectorization is at
2891 least VF, so set range information for newly generated var. */
2892 if (new_var_p)
2893 set_range_info (niters, VR_RANGE,
2894 wi::to_wide (build_int_cst (type, vf)),
2895 wi::to_wide (TYPE_MAX_VALUE (type)));
2896
2897 /* Prolog iterates at most bound_prolog times, latch iterates at
2898 most bound_prolog - 1 times. */
2899 record_niter_bound (prolog, bound_prolog - 1, false, true);
2900 delete_update_ssa ();
2901 adjust_vec_debug_stmts ();
2902 scev_reset ();
2903 }
2904
2905 if (epilog_peeling)
2906 {
2907 e = single_exit (loop);
2908 if (!slpeel_can_duplicate_loop_p (loop, e))
2909 {
2910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2911 "loop can't be duplicated to exit edge.\n");
2912 gcc_unreachable ();
2913 }
2914 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2915 said epilog then we should use a copy of the main loop as a starting
2916 point. This loop may have already had some preliminary transformations
2917 to allow for more optimal vectorization, for example if-conversion.
2918 If we are not vectorizing the epilog then we should use the scalar loop
2919 	 as the transformations mentioned above make little or no sense when not
2920 vectorizing. */
2921 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2922 if (vop_to_rename)
2923 {
2924 /* Vectorizing the main loop can sometimes introduce a vdef to
2925 a loop that previously didn't have one; see the comment above
2926 the definition of VOP_TO_RENAME for details. The definition
2927 D that holds on E will then be different from the definition
2928 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2929 rename VOP_TO_RENAME to D when copying the loop.
2930
2931 The virtual operand is in LCSSA form for the main loop,
2932 and no stmt between the main loop and E needs a vdef,
2933 so we know that D is provided by a phi rather than by a
2934 vdef on a normal gimple stmt. */
2935 basic_block vdef_bb = e->src;
2936 gphi *vphi;
2937 while (!(vphi = get_virtual_phi (vdef_bb)))
2938 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2939 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2940 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2941 }
2942 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2943 if (!epilog)
2944 {
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2946 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2947 gcc_unreachable ();
2948 }
2949 epilog->force_vectorize = false;
2950 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2951
2952 /* Scalar version loop may be preferred. In this case, add guard
2953 and skip to epilog. Note this only happens when the number of
2954 iterations of loop is unknown at compile time, otherwise this
2955 won't be vectorized. */
2956 if (skip_vector)
2957 {
2958 /* Additional epilogue iteration is peeled if gap exists. */
2959 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2960 bound_prolog, bound_epilog,
2961 th, &bound_scalar,
2962 check_profitability);
2963 /* Build guard against NITERSM1 since NITERS may overflow. */
2964 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2965 guard_bb = anchor;
2966 guard_to = split_edge (loop_preheader_edge (epilog));
2967 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2968 guard_to, guard_bb,
2969 prob_vector.invert (),
2970 irred_flag);
2971 skip_e = guard_e;
2972 e = EDGE_PRED (guard_to, 0);
2973 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2974 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2975
2976 /* Simply propagate profile info from guard_bb to guard_to which is
2977 a merge point of control flow. */
2978 guard_to->count = guard_bb->count;
2979
2980 /* Scale probability of epilog loop back.
2981 FIXME: We should avoid scaling down and back up. Profile may
2982 get lost if we scale down to 0. */
2983 basic_block *bbs = get_loop_body (epilog);
2984 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2985 bbs[i]->count = bbs[i]->count.apply_scale
2986 (bbs[i]->count,
2987 bbs[i]->count.apply_probability
2988 (prob_vector));
2989 free (bbs);
2990 }
2991
2992 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2993       /* If the loop is peeled a non-zero constant number of times, NITERS now
2994 	 refers to orig_niters - prolog_peeling; it won't overflow even if
2995 	 orig_niters overflows.  */
2996 niters_no_overflow |= (prolog_peeling > 0);
2997 vect_gen_vector_loop_niters (loop_vinfo, niters,
2998 niters_vector, step_vector,
2999 niters_no_overflow);
3000 if (!integer_onep (*step_vector))
3001 {
3002 	  /* On exit from the loop we will have an easy way of calculating
3003 NITERS_VECTOR / STEP * STEP. Install a dummy definition
3004 until then. */
3005 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3006 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3007 *niters_vector_mult_vf_var = niters_vector_mult_vf;
3008 }
3009 else
3010 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3011 &niters_vector_mult_vf);
3012 /* Update IVs of original loop as if they were advanced by
3013 niters_vector_mult_vf steps. */
3014 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3015 update_e = skip_vector ? e : loop_preheader_edge (epilog);
3016 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3017 update_e);
3018
3019 if (skip_epilog)
3020 {
3021 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3022 niters, niters_vector_mult_vf);
3023 guard_bb = single_exit (loop)->dest;
3024 guard_to = split_edge (single_exit (epilog));
3025 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3026 skip_vector ? anchor : guard_bb,
3027 prob_epilog.invert (),
3028 irred_flag);
3029 if (vect_epilogues)
3030 epilogue_vinfo->skip_this_loop_edge = guard_e;
3031 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
3032 single_exit (epilog));
3033 /* Only need to handle basic block before epilog loop if it's not
3034 the guard_bb, which is the case when skip_vector is true. */
3035 if (guard_bb != bb_before_epilog)
3036 {
3037 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3038
3039 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3040 }
3041 scale_loop_profile (epilog, prob_epilog, 0);
3042 }
3043 else
3044 slpeel_update_phi_nodes_for_lcssa (epilog);
3045
3046 unsigned HOST_WIDE_INT bound;
3047 if (bound_scalar.is_constant (&bound))
3048 {
3049 gcc_assert (bound != 0);
3050 /* -1 to convert loop iterations to latch iterations. */
3051 record_niter_bound (epilog, bound - 1, false, true);
3052 }
3053
3054 delete_update_ssa ();
3055 adjust_vec_debug_stmts ();
3056 scev_reset ();
3057 }
3058
3059 if (vect_epilogues)
3060 {
3061 epilog->aux = epilogue_vinfo;
3062 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3063
3064 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3065
3066       /* We must now calculate the number of iterations performed by the
3067 	 previous loop and the EPILOGUE_NITERS to be performed by the epilogue.  */
3068 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3069 niters_prolog, niters_vector_mult_vf);
3070
3071       /* If SKIP_VECTOR we may skip the previous loop; insert a phi-node to
3072 	 determine whether we are coming from the previous vectorized loop
3073 	 via the update_e edge or from the skip_vector basic block via the
3074 	 skip_e edge.  */
3075 if (skip_vector)
3076 {
3077 gcc_assert (update_e != NULL
3078 && skip_e != NULL
3079 && !vect_epilogues_updated_niters);
3080 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3081 update_e->dest);
3082 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3083 gimple *stmt = gimple_build_assign (new_ssa, niters);
3084 gimple_stmt_iterator gsi;
3085 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3086 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3087 {
3088 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3089 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3090 }
3091 else
3092 {
3093 gsi = gsi_last_bb (update_e->src);
3094 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3095 }
3096
3097 niters = new_ssa;
3098 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3099 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3100 UNKNOWN_LOCATION);
3101 niters = PHI_RESULT (new_phi);
3102 epilogue_vinfo->main_loop_edge = update_e;
3103 epilogue_vinfo->skip_main_loop_edge = skip_e;
3104 }
3105
3106 /* Set ADVANCE to the number of iterations performed by the previous
3107 loop and its prologue. */
3108 *advance = niters;
3109
3110 if (!vect_epilogues_updated_niters)
3111 {
3112 /* Subtract the number of iterations performed by the vectorized loop
3113 from the number of total iterations. */
3114 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3115 before_loop_niters,
3116 niters);
3117
3118 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3119 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3120 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3121 epilogue_niters,
3122 build_one_cst (TREE_TYPE (epilogue_niters)));
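/* For example, with hypothetical numbers: if the original loop runs 103
   iterations, the prologue peels 3 and the vector loop covers 96 of them,
   then NITERS above is 99, EPILOGUE_NITERS is 4 and NITERSM1 is 3.  */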
3123
3124 /* Decide what to do if the number of epilogue iterations is not
3125 a multiple of the epilogue loop's vectorization factor.
3126 We should have rejected the loop during the analysis phase
3127 if this fails. */
3128 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3129 true))
3130 gcc_unreachable ();
3131 }
3132 }
3133
3134 adjust_vec.release ();
3135 free_original_copy_tables ();
3136
3137 return vect_epilogues ? epilog : NULL;
3138 }
3139
3140 /* Function vect_create_cond_for_niters_checks.
3141
3142 Create a conditional expression that represents the run-time checks for
3143 the loop's niter.  The loop is guaranteed to terminate if the run-time
3144 checks hold.
3145
3146 Input:
3147 COND_EXPR  - input conditional expression.  New conditions will be chained
3148 with logical AND operation.  If it is NULL, the niter
3149 assumptions become the condition on their own.
3150 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3151 under which the niter information is valid.
3152
3153 Output:
3154 COND_EXPR - conditional expression.
3155
3156 The returned COND_EXPR is the conditional expression to be used in the
3157 if statement that controls which version of the loop gets executed at
3158 runtime. */
3159
3160 static void
3161 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3162 {
3163 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3164
3165 if (*cond_expr)
3166 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3167 *cond_expr, part_cond_expr);
3168 else
3169 *cond_expr = part_cond_expr;
3170 }
3171
3172 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3173 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3174
3175 static void
3176 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3177 {
3178 if (*cond_expr)
3179 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3180 *cond_expr, part_cond_expr);
3181 else
3182 *cond_expr = part_cond_expr;
3183 }
3184
3185 /* Function vect_create_cond_for_align_checks.
3186
3187 Create a conditional expression that represents the alignment checks for
3188 all of the data references (array element references) whose alignment must be
3189 checked at runtime.
3190
3191 Input:
3192 COND_EXPR - input conditional expression. New conditions will be chained
3193 with logical AND operation.
3194 LOOP_VINFO - two fields of the loop information are used.
3195 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3196 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3197
3198 Output:
3199 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3200 expression.
3201 The returned value is the conditional expression to be used in the if
3202 statement that controls which version of the loop gets executed at runtime.
3203
3204 The algorithm makes two assumptions:
3205 1) The number of bytes "n" in a vector is a power of 2.
3206 2) An address "a" is aligned if a%n is zero, so this
3207 test can be done as a&(n-1) == 0. For example, for 16
3208 byte vectors the test is a&0xf == 0. */
3209
3210 static void
3211 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3212 tree *cond_expr,
3213 gimple_seq *cond_expr_stmt_list)
3214 {
3215 const vec<stmt_vec_info> &may_misalign_stmts
3216 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3217 stmt_vec_info stmt_info;
3218 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3219 tree mask_cst;
3220 unsigned int i;
3221 tree int_ptrsize_type;
3222 char tmp_name[20];
3223 tree or_tmp_name = NULL_TREE;
3224 tree and_tmp_name;
3225 gimple *and_stmt;
3226 tree ptrsize_zero;
3227 tree part_cond_expr;
3228
3229 /* Check that mask is one less than a power of 2, i.e., mask is
3230 all zeros followed by all ones. */
3231 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3232
3233 int_ptrsize_type = signed_type_for (ptr_type_node);
3234
3235 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3236 of the first vector of the i'th data reference. */
3237
3238 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3239 {
3240 gimple_seq new_stmt_list = NULL;
3241 tree addr_base;
3242 tree addr_tmp_name;
3243 tree new_or_tmp_name;
3244 gimple *addr_stmt, *or_stmt;
3245 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3246 bool negative = tree_int_cst_compare
3247 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3248 tree offset = negative
3249 ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3250 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3251 : size_zero_node;
3252
3253 /* create: addr_tmp = (int)(address_of_first_vector) */
3254 addr_base =
3255 vect_create_addr_base_for_vector_ref (loop_vinfo,
3256 stmt_info, &new_stmt_list,
3257 offset);
3258 if (new_stmt_list != NULL)
3259 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3260
3261 sprintf (tmp_name, "addr2int%d", i);
3262 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3263 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3264 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3265
3266 /* The addresses are ORed together.  */
3267
3268 if (or_tmp_name != NULL_TREE)
3269 {
3270 /* create: or_tmp = or_tmp | addr_tmp */
3271 sprintf (tmp_name, "orptrs%d", i);
3272 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3273 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3274 or_tmp_name, addr_tmp_name);
3275 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3276 or_tmp_name = new_or_tmp_name;
3277 }
3278 else
3279 or_tmp_name = addr_tmp_name;
3280
3281 } /* end for i */
3282
3283 mask_cst = build_int_cst (int_ptrsize_type, mask);
3284
3285 /* create: and_tmp = or_tmp & mask */
3286 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3287
3288 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3289 or_tmp_name, mask_cst);
3290 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3291
3292 /* Make and_tmp the left operand of the conditional test against zero.
3293 If and_tmp has a nonzero bit then some address is unaligned.  */
3294 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3295 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3296 and_tmp_name, ptrsize_zero);
3297 chain_cond_expr (cond_expr, part_cond_expr);
3298 }
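
/* A purely illustrative sketch, not part of the vectorizer itself: at the
   source level the runtime test built above reduces to something like the
   hypothetical helper below, assuming a 16-byte vector size (mask 0xf) and
   just two data references at addresses A and B (the compiler-provided
   __UINTPTR_TYPE__ is used to stay header-free).  */

static inline int
example_all_refs_aligned (const void *a, const void *b)
{
  /* OR the addresses together, mask the low bits and compare with zero;
     a nonzero result means at least one address is misaligned.  */
  __UINTPTR_TYPE__ ored = (__UINTPTR_TYPE__) a | (__UINTPTR_TYPE__) b;
  return (ored & 0xf) == 0;
}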
3299
3300 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3301 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3302 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3303 and this new condition are true. Treat a null *COND_EXPR as "true". */
3304
3305 static void
3306 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3307 {
3308 const vec<vec_object_pair> &pairs
3309 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3310 unsigned int i;
3311 vec_object_pair *pair;
3312 FOR_EACH_VEC_ELT (pairs, i, pair)
3313 {
3314 tree addr1 = build_fold_addr_expr (pair->first);
3315 tree addr2 = build_fold_addr_expr (pair->second);
3316 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3317 addr1, addr2);
3318 chain_cond_expr (cond_expr, part_cond_expr);
3319 }
3320 }
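
/* Illustrative only: for two pairs of objects the condition built above
   corresponds to a source-level test such as this hypothetical helper,
   i.e. each pair must live at distinct addresses.  */

static inline int
example_addrs_unequal (const void *a1, const void *b1,
		       const void *a2, const void *b2)
{
  return a1 != b1 && a2 != b2;
}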
3321
3322 /* Create an expression that is true when all lower-bound conditions for
3323 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3324
3325 static void
3326 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3327 {
3328 const vec<vec_lower_bound> &lower_bounds
3329 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3330 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3331 {
3332 tree expr = lower_bounds[i].expr;
3333 tree type = unsigned_type_for (TREE_TYPE (expr));
3334 expr = fold_convert (type, expr);
3335 poly_uint64 bound = lower_bounds[i].min_value;
3336 if (!lower_bounds[i].unsigned_p)
3337 {
3338 expr = fold_build2 (PLUS_EXPR, type, expr,
3339 build_int_cstu (type, bound - 1));
3340 bound += bound - 1;
3341 }
3342 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3343 build_int_cstu (type, bound));
3344 chain_cond_expr (cond_expr, part_cond_expr);
3345 }
3346 }
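
/* A minimal stand-alone sketch of the trick used above for signed
   expressions (hypothetical helper, assuming a 32-bit EXPR and a small
   positive BOUND): checking (unsigned) expr + (bound - 1) >= 2 * bound - 1
   is a branchless way of testing expr <= -bound || expr >= bound.  */

static inline int
example_abs_ge_bound (int expr, unsigned int bound)
{
  /* Values in (-bound, bound) map to [0, 2*bound-2] after the addition;
     everything else lands at or wraps to a value of at least 2*bound-1.  */
  return (unsigned int) expr + (bound - 1) >= 2 * bound - 1;
}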
3347
3348 /* Function vect_create_cond_for_alias_checks.
3349
3350 Create a conditional expression that represents the run-time checks for
3351 overlapping of address ranges represented by a list of data references
3352 relations passed as input.
3353
3354 Input:
3355 COND_EXPR - input conditional expression. New conditions will be chained
3356 with logical AND operation.  If it is NULL, the alias
3357 checks become the condition on their own.
3358 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3359 to be checked.
3360
3361 Output:
3362 COND_EXPR - conditional expression.
3363
3364 The returned COND_EXPR is the conditional expression to be used in the if
3365 statement that controls which version of the loop gets executed at runtime.
3366 */
3367
3368 void
3369 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3370 {
3371 const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3372 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3373
3374 if (comp_alias_ddrs.is_empty ())
3375 return;
3376
3377 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3378 &comp_alias_ddrs, cond_expr);
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "created %u versioning for alias checks.\n",
3382 comp_alias_ddrs.length ());
3383 }
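
/* Conceptually (illustration only; the checks actually emitted by
   create_runtime_alias_checks also account for access size, step and
   direction), each alias check verifies that two accessed segments do not
   overlap, as in this hypothetical helper.  */

static inline int
example_segments_independent (const char *a, unsigned long len_a,
			      const char *b, unsigned long len_b)
{
  /* The ranges [a, a + len_a) and [b, b + len_b) must be disjoint for the
     vectorized loop version to be safe.  */
  return a + len_a <= b || b + len_b <= a;
}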
3384
3385
3386 /* Function vect_loop_versioning.
3387
3388 If the loop has data references that may or may not be aligned and/or
3389 has data reference relations whose independence was not proven then
3390 two versions of the loop need to be generated, one which is vectorized
3391 and one which isn't. A test is then generated to control which of the
3392 loops is executed. The test checks for the alignment of all of the
3393 data references that may or may not be aligned. An additional
3394 sequence of runtime tests is generated for each pair of DDRs whose
3395 independence was not proven.  The vectorized version of the loop is
3396 executed only if both the alias and the alignment tests pass.
3397
3398 The test generated to check which version of loop is executed
3399 is modified to also check for profitability as indicated by the
3400 cost model threshold TH.
3401
3402 The versioning precondition(s) are placed in *COND_EXPR and
3403 *COND_EXPR_STMT_LIST. */
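
/* For illustration only, the structure produced by this function
   corresponds at the source level to something like

     if (<alignment checks> && <alias checks> && <niter assumptions>
	 && <niters above the cost threshold>)
       ...loop to be vectorized...
     else
       ...unchanged scalar loop...

   where the individual sub-conditions are built by the helpers above and
   combined into COND_EXPR.  */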
3404
3405 class loop *
3406 vect_loop_versioning (loop_vec_info loop_vinfo,
3407 gimple *loop_vectorized_call)
3408 {
3409 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3410 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3411 basic_block condition_bb;
3412 gphi_iterator gsi;
3413 gimple_stmt_iterator cond_exp_gsi;
3414 basic_block merge_bb;
3415 basic_block new_exit_bb;
3416 edge new_exit_e, e;
3417 gphi *orig_phi, *new_phi;
3418 tree cond_expr = NULL_TREE;
3419 gimple_seq cond_expr_stmt_list = NULL;
3420 tree arg;
3421 profile_probability prob = profile_probability::likely ();
3422 gimple_seq gimplify_stmt_list = NULL;
3423 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3424 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3425 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3426 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3427 poly_uint64 versioning_threshold
3428 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3429 tree version_simd_if_cond
3430 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3431 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3432
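/* Note that SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1, i.e. the number of
   latch executions, so comparing it against TH - 1 (respectively
   VERSIONING_THRESHOLD - 1) below tests whether the scalar loop would run
   at least TH or VERSIONING_THRESHOLD iterations.  */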
3433 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3434 && !ordered_p (th, versioning_threshold))
3435 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3436 build_int_cst (TREE_TYPE (scalar_loop_iters),
3437 th - 1));
3438 if (maybe_ne (versioning_threshold, 0U))
3439 {
3440 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3441 build_int_cst (TREE_TYPE (scalar_loop_iters),
3442 versioning_threshold - 1));
3443 if (cond_expr)
3444 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3445 expr, cond_expr);
3446 else
3447 cond_expr = expr;
3448 }
3449
3450 tree cost_name = NULL_TREE;
3451 profile_probability prob2 = profile_probability::uninitialized ();
3452 if (cond_expr
3453 && EXPR_P (cond_expr)
3454 && (version_niter
3455 || version_align
3456 || version_alias
3457 || version_simd_if_cond))
3458 {
3459 cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3460 &cond_expr_stmt_list,
3461 is_gimple_val, NULL_TREE);
3462 /* Split PROB into two so that the overall probability of passing
3463 both the cost-model and versioning checks equals the original PROB.  */
3464 prob2 = prob.split (prob);
3465 }
3466
3467 if (version_niter)
3468 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3469
3470 if (cond_expr)
3471 {
3472 gimple_seq tem = NULL;
3473 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3474 &tem,
3475 is_gimple_condexpr, NULL_TREE);
3476 gimple_seq_add_seq (&cond_expr_stmt_list, tem);
3477 }
3478
3479 if (version_align)
3480 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3481 &cond_expr_stmt_list);
3482
3483 if (version_alias)
3484 {
3485 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3486 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3487 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3488 }
3489
3490 if (version_simd_if_cond)
3491 {
3492 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3493 if (flag_checking)
3494 if (basic_block bb
3495 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3496 gcc_assert (bb != loop->header
3497 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3498 && (scalar_loop == NULL
3499 || (bb != scalar_loop->header
3500 && dominated_by_p (CDI_DOMINATORS,
3501 scalar_loop->header, bb))));
3502 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3503 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3504 version_simd_if_cond, zero);
3505 if (cond_expr)
3506 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3507 c, cond_expr);
3508 else
3509 cond_expr = c;
3510 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_NOTE, vect_location,
3512 "created versioning for simd if condition check.\n");
3513 }
3514
3515 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3516 &gimplify_stmt_list,
3517 is_gimple_condexpr, NULL_TREE);
3518 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3519
3520 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3521 invariant in.  */
3522 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3523 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3524 !gsi_end_p (gsi); gsi_next (&gsi))
3525 {
3526 gimple *stmt = gsi_stmt (gsi);
3527 update_stmt (stmt);
3528 ssa_op_iter iter;
3529 use_operand_p use_p;
3530 basic_block def_bb;
3531 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3532 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3533 && flow_bb_inside_loop_p (outermost, def_bb))
3534 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3535 }
3536
3537 /* Search for the outermost loop we can version. Avoid versioning of
3538 non-perfect nests but allow if-conversion versioned loops inside. */
3539 class loop *loop_to_version = loop;
3540 if (flow_loop_nested_p (outermost, loop))
3541 {
3542 if (dump_enabled_p ())
3543 dump_printf_loc (MSG_NOTE, vect_location,
3544 "trying to apply versioning to outer loop %d\n",
3545 outermost->num);
3546 if (outermost->num == 0)
3547 outermost = superloop_at_depth (loop, 1);
3548 /* And avoid applying versioning on non-perfect nests. */
3549 while (loop_to_version != outermost
3550 && (e = single_exit (loop_outer (loop_to_version)))
3551 && !(e->flags & EDGE_COMPLEX)
3552 && (!loop_outer (loop_to_version)->inner->next
3553 || vect_loop_vectorized_call (loop_to_version))
3554 && (!loop_outer (loop_to_version)->inner->next
3555 || !loop_outer (loop_to_version)->inner->next->next))
3556 loop_to_version = loop_outer (loop_to_version);
3557 }
3558
3559 /* Apply versioning. If there is already a scalar version created by
3560 if-conversion re-use that. Note we cannot re-use the copy of
3561 an if-converted outer-loop when vectorizing the inner loop only. */
3562 gcond *cond;
3563 if ((!loop_to_version->inner || loop == loop_to_version)
3564 && loop_vectorized_call)
3565 {
3566 gcc_assert (scalar_loop);
3567 condition_bb = gimple_bb (loop_vectorized_call);
3568 cond = as_a <gcond *> (last_stmt (condition_bb));
3569 gimple_cond_set_condition_from_tree (cond, cond_expr);
3570 update_stmt (cond);
3571
3572 if (cond_expr_stmt_list)
3573 {
3574 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3575 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3576 GSI_SAME_STMT);
3577 }
3578
3579 /* If-conversion uses profile_probability::always () for both paths;
3580 reset the paths' probabilities appropriately.  */
3581 edge te, fe;
3582 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3583 te->probability = prob;
3584 fe->probability = prob.invert ();
3585 /* We can scale the loop counts immediately but have to postpone
3586 scaling the scalar loop because we re-use it during peeling.  */
3587 scale_loop_frequencies (loop_to_version, te->probability);
3588 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3589
3590 nloop = scalar_loop;
3591 if (dump_enabled_p ())
3592 dump_printf_loc (MSG_NOTE, vect_location,
3593 "reusing %sloop version created by if conversion\n",
3594 loop_to_version != loop ? "outer " : "");
3595 }
3596 else
3597 {
3598 if (loop_to_version != loop
3599 && dump_enabled_p ())
3600 dump_printf_loc (MSG_NOTE, vect_location,
3601 "applying loop versioning to outer loop %d\n",
3602 loop_to_version->num);
3603
3604 unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
3605
3606 initialize_original_copy_tables ();
3607 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3608 prob, prob.invert (), prob, prob.invert (), true);
3609 gcc_assert (nloop);
3610 nloop = get_loop_copy (loop);
3611
3612 /* For cycle vectorization with SLP we rely on the PHI arguments
3613 appearing in the same order as the SLP node operands which for the
3614 loop PHI nodes means the preheader edge dest index needs to remain
3615 the same for the analyzed loop which also becomes the vectorized one.
3616 Make it so in case the state after versioning differs by redirecting
3617 the first edge into the header to the same destination which moves
3618 it last. */
3619 if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
3620 {
3621 edge e = EDGE_PRED (loop->header, 0);
3622 ssa_redirect_edge (e, e->dest);
3623 flush_pending_stmts (e);
3624 }
3625 gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
3626
3627 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3628 reap those otherwise; they also refer to the original
3629 loops. */
3630 class loop *l = loop;
3631 while (gimple *call = vect_loop_vectorized_call (l))
3632 {
3633 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3634 fold_loop_internal_call (call, boolean_false_node);
3635 l = loop_outer (l);
3636 }
3637 free_original_copy_tables ();
3638
3639 if (cond_expr_stmt_list)
3640 {
3641 cond_exp_gsi = gsi_last_bb (condition_bb);
3642 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3643 GSI_SAME_STMT);
3644 }
3645
3646 /* Loop versioning violates an assumption we try to maintain during
3647 vectorization - that the loop exit block has a single predecessor.
3648 After versioning, the exit block of both loop versions is the same
3649 basic block (i.e. it has two predecessors). Just in order to simplify
3650 following transformations in the vectorizer, we fix this situation
3651 here by adding a new (empty) block on the exit-edge of the loop,
3652 with the proper loop-exit phis to maintain loop-closed-form.
3653 If loop versioning wasn't done from loop, but from scalar_loop instead,
3654 merge_bb will already have just a single predecessor.  */
3655
3656 merge_bb = single_exit (loop_to_version)->dest;
3657 if (EDGE_COUNT (merge_bb->preds) >= 2)
3658 {
3659 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3660 new_exit_bb = split_edge (single_exit (loop_to_version));
3661 new_exit_e = single_exit (loop_to_version);
3662 e = EDGE_SUCC (new_exit_bb, 0);
3663
3664 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3665 gsi_next (&gsi))
3666 {
3667 tree new_res;
3668 orig_phi = gsi.phi ();
3669 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3670 new_phi = create_phi_node (new_res, new_exit_bb);
3671 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3672 add_phi_arg (new_phi, arg, new_exit_e,
3673 gimple_phi_arg_location_from_edge (orig_phi, e));
3674 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3675 }
3676 }
3677
3678 update_ssa (TODO_update_ssa);
3679 }
3680
3681 /* Split the cost model check off to a separate BB. Costing assumes
3682 this is the only thing we perform when we enter the scalar loop
3683 from a failed cost decision. */
3684 if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
3685 {
3686 gimple *def = SSA_NAME_DEF_STMT (cost_name);
3687 gcc_assert (gimple_bb (def) == condition_bb);
3688 /* All uses of the cost check are 'true' after the check we
3689 are going to insert. */
3690 replace_uses_by (cost_name, boolean_true_node);
3691 /* And we're going to build the new single use of it. */
3692 gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
3693 NULL_TREE, NULL_TREE);
3694 edge e = split_block (gimple_bb (def), def);
3695 gimple_stmt_iterator gsi = gsi_for_stmt (def);
3696 gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
3697 edge true_e, false_e;
3698 extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
3699 e->flags &= ~EDGE_FALLTHRU;
3700 e->flags |= EDGE_TRUE_VALUE;
3701 edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
3702 e->probability = prob2;
3703 e2->probability = prob2.invert ();
3704 set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
3705 auto_vec<basic_block, 3> adj;
3706 for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
3707 son;
3708 son = next_dom_son (CDI_DOMINATORS, son))
3709 if (EDGE_COUNT (son->preds) > 1)
3710 adj.safe_push (son);
3711 for (auto son : adj)
3712 set_immediate_dominator (CDI_DOMINATORS, son, e->src);
3713 }
3714
3715 if (version_niter)
3716 {
3717 /* The versioned loop could be infinite; we need to clear the existing
3718 niter information, which was copied from the original loop.  */
3719 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3720 vect_free_loop_info_assumptions (nloop);
3721 }
3722
3723 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3724 && dump_enabled_p ())
3725 {
3726 if (version_alias)
3727 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3728 vect_location,
3729 "loop versioned for vectorization because of "
3730 "possible aliasing\n");
3731 if (version_align)
3732 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3733 vect_location,
3734 "loop versioned for vectorization to enhance "
3735 "alignment\n");
3736
3737 }
3738
3739 return nloop;
3740 }
3741