1 /* Loop Vectorization
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58
59 /* Loop Vectorization Pass.
60
61 This pass tries to vectorize loops.
62
63 For example, the vectorizer transforms the following simple loop:
64
65 short a[N]; short b[N]; short c[N]; int i;
66
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
70
71 as if it were manually vectorized by rewriting the source code into:
72
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
77
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
84
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
94 (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
96
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
102
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
107
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
118
119 For example, say stmt S1 was vectorized into stmt VS1:
120
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
124
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
129
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
137
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
145
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
152
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
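/* As an illustration of the target support check described above (a minimal
   sketch only; the real checks live in the optabs-tree helpers and the
   vectorizable_* routines), verifying that a V8HI addition is available
   comes down to something like

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   // the target has no vector add for this mode

   i.e. the stmt is only considered vectorizable if the optab query returns
   a real instruction code.  */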
156
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *);
161
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
165
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
170 {
171 gimple *stmt = stmt_info->stmt;
172
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
176 {
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
180 }
181
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
188
189 if (stmt_vectype)
190 {
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype has already been set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
223
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
226 {
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
233 {
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
242 }
243
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
251 }
252
253 return opt_result::success ();
254 }
255
256 /* Function vect_determine_vectorization_factor
257
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4-byte elements,
261 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
262 elements can fit in a single vector register.
263
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
268
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
273 }
274
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
278 }
279 */
280
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
293
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295
296 for (i = 0; i < nbbs; i++)
297 {
298 basic_block bb = bbs[i];
299
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
302 {
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 phi);
308
309 gcc_assert (stmt_info);
310
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
313 {
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
316
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
321
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
329
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
333
334 if (dump_enabled_p ())
335 {
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
339 }
340
341 vect_update_max_nunits (&vectorization_factor, vectype);
342 }
343 }
344
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
347 {
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
356 }
357 }
358
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
365 }
366
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
372 }
373
374
375 /* Function vect_is_simple_iv_evolution.
376
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
379
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
383 {
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
388
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
393
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
398
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
405
406 *init = init_expr;
407 *step = step_expr;
408
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
418 {
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
423 }
424
425 return true;
426 }
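/* For illustration only (a sketch, not used by the code above): in SCEV
   chrec notation a "simple" evolution accepted here looks like

     i  ->  {init, +, step}_loop      e.g.  {0, +, 4}_1

   where the step must not itself be a chrec.  A second-degree evolution
   such as {0, +, {1, +, 1}_1}_1 (an IV whose step changes every iteration,
   as in i += j; j++;) is rejected because its evolution part is again a
   chrec.  */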
427
428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
429 what we are assuming is a double reduction. For example, given
430 a structure like this:
431
432 outer1:
433 x_1 = PHI <x_4(outer2), ...>;
434 ...
435
436 inner:
437 x_2 = PHI <x_1(outer1), ...>;
438 ...
439 x_3 = ...;
440 ...
441
442 outer2:
443 x_4 = PHI <x_3(inner)>;
444 ...
445
446 outer loop analysis would treat x_1 as a double reduction phi and
447 this function would then return true for x_2. */
448
449 static bool
450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
451 {
452 use_operand_p use_p;
453 ssa_op_iter op_iter;
454 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
455 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
456 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
457 return true;
458 return false;
459 }
460
461 /* Function vect_analyze_scalar_cycles_1.
462
463 Examine the cross iteration def-use cycles of scalar variables
464 in LOOP. LOOP_VINFO represents the loop that is now being
465 considered for vectorization (can be LOOP, or an outer-loop
466 enclosing LOOP). */
467
468 static void
469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
470 {
471 basic_block bb = loop->header;
472 tree init, step;
473 auto_vec<stmt_vec_info, 64> worklist;
474 gphi_iterator gsi;
475 bool double_reduc, reduc_chain;
476
477 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
478
479 /* First - identify all inductions. Reduction detection assumes that all the
480 inductions have been identified, therefore, this order must not be
481 changed. */
482 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
483 {
484 gphi *phi = gsi.phi ();
485 tree access_fn = NULL;
486 tree def = PHI_RESULT (phi);
487 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
488
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
491
492 /* Skip virtual phis. The data dependences that are associated with
493 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
494 if (virtual_operand_p (def))
495 continue;
496
497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
498
499 /* Analyze the evolution function. */
500 access_fn = analyze_scalar_evolution (loop, def);
501 if (access_fn)
502 {
503 STRIP_NOPS (access_fn);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "Access function of PHI: %T\n", access_fn);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
508 = initial_condition_in_loop_num (access_fn, loop->num);
509 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
510 = evolution_part_in_loop_num (access_fn, loop->num);
511 }
512
513 if (!access_fn
514 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
515 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
516 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
517 && TREE_CODE (step) != INTEGER_CST))
518 {
519 worklist.safe_push (stmt_vinfo);
520 continue;
521 }
522
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
524 != NULL_TREE);
525 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
526
527 if (dump_enabled_p ())
528 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 }
531
532
533 /* Second - identify all reductions and nested cycles. */
534 while (worklist.length () > 0)
535 {
536 stmt_vec_info stmt_vinfo = worklist.pop ();
537 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
538 tree def = PHI_RESULT (phi);
539
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
542
543 gcc_assert (!virtual_operand_p (def)
544 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
545
546 stmt_vec_info reduc_stmt_info
547 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
548 &reduc_chain);
549 if (reduc_stmt_info)
550 {
551 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
552 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
553 if (double_reduc)
554 {
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location,
557 "Detected double reduction.\n");
558
559 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
560 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
561 }
562 else
563 {
564 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected vectorizable nested cycle.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
571 }
572 else
573 {
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location,
576 "Detected reduction.\n");
577
578 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
579 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
580 /* Store the reduction cycles for possible vectorization in
581 loop-aware SLP if it was not detected as a reduction
582 chain. */
583 if (! reduc_chain)
584 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
585 (reduc_stmt_info);
586 }
587 }
588 }
589 else
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
592 "Unknown def-use cycle pattern.\n");
593 }
594 }
595
596
597 /* Function vect_analyze_scalar_cycles.
598
599 Examine the cross iteration def-use cycles of scalar variables, by
600 analyzing the loop-header PHIs of scalar variables. Classify each
601 cycle as one of the following: invariant, induction, reduction, unknown.
602 We do that for the loop represented by LOOP_VINFO, and also for its
603 inner loop, if it exists.
604 Examples for scalar cycles:
605
606 Example1: reduction:
607
608 loop1:
609 for (i=0; i<N; i++)
610 sum += a[i];
611
612 Example2: induction:
613
614 loop2:
615 for (i=0; i<N; i++)
616 a[i] = i; */
617
618 static void
619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
620 {
621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622
623 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
624
625 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
626 Reductions in such an inner loop therefore have different properties than
627 the reductions in the nest that gets vectorized:
628 1. When vectorized, they are executed in the same order as in the original
629 scalar loop, so we can't change the order of computation when
630 vectorizing them.
631 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
632 current checks are too strict. */
633
634 if (loop->inner)
635 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 }
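/* Illustrative example (not part of the implementation): when the outer
   loop below is the one considered for vectorization, the scalar cycle of
   SUM runs through both loop headers and is classified as a double
   reduction, whereas a cycle that is local to the inner loop would be
   classified as a nested cycle:

     for (i = 0; i < N; i++)        // outer loop being vectorized
       for (j = 0; j < M; j++)
         sum += a[i][j];            // double reduction of SUM
*/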
637
638 /* Transfer group and reduction information from STMT_INFO to its
639 pattern stmt. */
640
641 static void
642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
643 {
644 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
645 stmt_vec_info stmtp;
646 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
647 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
648 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 do
650 {
651 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
652 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
653 == STMT_VINFO_DEF_TYPE (stmt_info));
654 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
655 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
656 if (stmt_info)
657 REDUC_GROUP_NEXT_ELEMENT (stmtp)
658 = STMT_VINFO_RELATED_STMT (stmt_info);
659 }
660 while (stmt_info);
661 }
662
663 /* Fixup scalar cycles that now have their stmts detected as patterns. */
664
665 static void
666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
667 {
668 stmt_vec_info first;
669 unsigned i;
670
671 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
672 {
673 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
674 while (next)
675 {
676 if ((STMT_VINFO_IN_PATTERN_P (next)
677 != STMT_VINFO_IN_PATTERN_P (first))
678 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
679 break;
680 next = REDUC_GROUP_NEXT_ELEMENT (next);
681 }
682 /* If all reduction chain members are well-formed patterns, adjust
683 the group to group the pattern stmts instead. */
684 if (! next
685 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
686 {
687 if (STMT_VINFO_IN_PATTERN_P (first))
688 {
689 vect_fixup_reduc_chain (first);
690 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
691 = STMT_VINFO_RELATED_STMT (first);
692 }
693 }
694 /* If not all stmts in the chain are patterns or if we failed
695 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
696 it as regular reduction instead. */
697 else
698 {
699 stmt_vec_info vinfo = first;
700 stmt_vec_info last = NULL;
701 while (vinfo)
702 {
703 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
704 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
705 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
706 last = vinfo;
707 vinfo = next;
708 }
709 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
710 = vect_internal_def;
711 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
712 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
713 --i;
714 }
715 }
716 }
717
718 /* Function vect_get_loop_niters.
719
720 Determine how many iterations the loop executes and place it
721 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
722 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
723 niter information holds in ASSUMPTIONS.
724
725 Return the loop exit condition. */
726
727
728 static gcond *
729 vect_get_loop_niters (class loop *loop, tree *assumptions,
730 tree *number_of_iterations, tree *number_of_iterationsm1)
731 {
732 edge exit = single_exit (loop);
733 class tree_niter_desc niter_desc;
734 tree niter_assumptions, niter, may_be_zero;
735 gcond *cond = get_loop_exit_condition (loop);
736
737 *assumptions = boolean_true_node;
738 *number_of_iterationsm1 = chrec_dont_know;
739 *number_of_iterations = chrec_dont_know;
740 DUMP_VECT_SCOPE ("get_loop_niters");
741
742 if (!exit)
743 return cond;
744
745 may_be_zero = NULL_TREE;
746 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
747 || chrec_contains_undetermined (niter_desc.niter))
748 return cond;
749
750 niter_assumptions = niter_desc.assumptions;
751 may_be_zero = niter_desc.may_be_zero;
752 niter = niter_desc.niter;
753
754 if (may_be_zero && integer_zerop (may_be_zero))
755 may_be_zero = NULL_TREE;
756
757 if (may_be_zero)
758 {
759 if (COMPARISON_CLASS_P (may_be_zero))
760 {
761 /* Try to combine may_be_zero with assumptions, this can simplify
762 computation of niter expression. */
763 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
764 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
765 niter_assumptions,
766 fold_build1 (TRUTH_NOT_EXPR,
767 boolean_type_node,
768 may_be_zero));
769 else
770 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
771 build_int_cst (TREE_TYPE (niter), 0),
772 rewrite_to_non_trapping_overflow (niter));
773
774 may_be_zero = NULL_TREE;
775 }
776 else if (integer_nonzerop (may_be_zero))
777 {
778 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
779 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
780 return cond;
781 }
782 else
783 return cond;
784 }
785
786 *assumptions = niter_assumptions;
787 *number_of_iterationsm1 = niter;
788
789 /* We want the number of loop header executions which is the number
790 of latch executions plus one.
791 ??? For UINT_MAX latch executions this number overflows to zero
792 for loops like do { n++; } while (n != 0); */
793 if (niter && !chrec_contains_undetermined (niter))
794 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
795 build_int_cst (TREE_TYPE (niter), 1));
796 *number_of_iterations = niter;
797
798 return cond;
799 }
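/* A worked example of the convention above (illustration only): for a
   counted loop such as

     for (i = 0; i < n; i++)  ...     // assuming n >= 1

   the latch executes n - 1 times and the header n times, so this function
   returns NUMBER_OF_ITERATIONSM1 == n - 1 and NUMBER_OF_ITERATIONS == n
   (modulo the UINT_MAX overflow caveat mentioned above).  */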
800
801 /* Function bb_in_loop_p
802
803 Used as predicate for dfs order traversal of the loop bbs. */
804
805 static bool
806 bb_in_loop_p (const_basic_block bb, const void *data)
807 {
808 const class loop *const loop = (const class loop *)data;
809 if (flow_bb_inside_loop_p (loop, bb))
810 return true;
811 return false;
812 }
813
814
815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
816 stmt_vec_info structs for all the stmts in LOOP_IN. */
817
818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
819 : vec_info (vec_info::loop, shared),
820 loop (loop_in),
821 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
822 num_itersm1 (NULL_TREE),
823 num_iters (NULL_TREE),
824 num_iters_unchanged (NULL_TREE),
825 num_iters_assumptions (NULL_TREE),
826 vector_costs (nullptr),
827 scalar_costs (nullptr),
828 th (0),
829 versioning_threshold (0),
830 vectorization_factor (0),
831 main_loop_edge (nullptr),
832 skip_main_loop_edge (nullptr),
833 skip_this_loop_edge (nullptr),
834 reusable_accumulators (),
835 suggested_unroll_factor (1),
836 max_vectorization_factor (0),
837 mask_skip_niters (NULL_TREE),
838 rgroup_compare_type (NULL_TREE),
839 simd_if_cond (NULL_TREE),
840 unaligned_dr (NULL),
841 peeling_for_alignment (0),
842 ptr_mask (0),
843 ivexpr_map (NULL),
844 scan_map (NULL),
845 slp_unrolling_factor (1),
846 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
847 vectorizable (false),
848 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
849 using_partial_vectors_p (false),
850 epil_using_partial_vectors_p (false),
851 partial_load_store_bias (0),
852 peeling_for_gaps (false),
853 peeling_for_niter (false),
854 no_data_dependencies (false),
855 has_mask_store (false),
856 scalar_loop_scaling (profile_probability::uninitialized ()),
857 scalar_loop (NULL),
858 orig_loop_info (NULL)
859 {
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as a reverse postorder traversal, so we are safe. */
864
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
868
869 for (unsigned int i = 0; i < nbbs; i++)
870 {
871 basic_block bb = bbs[i];
872 gimple_stmt_iterator si;
873
874 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875 {
876 gimple *phi = gsi_stmt (si);
877 gimple_set_uid (phi, 0);
878 add_stmt (phi);
879 }
880
881 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
882 {
883 gimple *stmt = gsi_stmt (si);
884 gimple_set_uid (stmt, 0);
885 if (is_gimple_debug (stmt))
886 continue;
887 add_stmt (stmt);
888 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
889 third argument is the #pragma omp simd if (x) condition. When it is 0,
890 the loop shouldn't be vectorized; when it is a non-zero constant, it
891 should be vectorized normally; otherwise the loop is versioned and the
892 vectorized copy is used only if the condition is non-zero at runtime. */
893 if (loop_in->simduid
894 && is_gimple_call (stmt)
895 && gimple_call_internal_p (stmt)
896 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
897 && gimple_call_num_args (stmt) >= 3
898 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
899 && (loop_in->simduid
900 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
901 {
902 tree arg = gimple_call_arg (stmt, 2);
903 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
904 simd_if_cond = arg;
905 else
906 gcc_assert (integer_nonzerop (arg));
907 }
908 }
909 }
910
911 epilogue_vinfos.create (6);
912 }
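/* For illustration (a sketch of the simd_if_cond case handled above): for a
   loop written as

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   the third argument of the .GOMP_SIMD_LANE call carries X, so
   simd_if_cond ends up being the SSA name of X and the vectorized copy of
   the loop is only used when X is non-zero at runtime.  */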
913
914 /* Free all levels of rgroup CONTROLS. */
915
916 void
917 release_vec_loop_controls (vec<rgroup_controls> *controls)
918 {
919 rgroup_controls *rgc;
920 unsigned int i;
921 FOR_EACH_VEC_ELT (*controls, i, rgc)
922 rgc->controls.release ();
923 controls->release ();
924 }
925
926 /* Free all memory used by the _loop_vec_info, as well as all the
927 stmt_vec_info structs of all the stmts in the loop. */
928
929 _loop_vec_info::~_loop_vec_info ()
930 {
931 free (bbs);
932
933 release_vec_loop_controls (&masks);
934 release_vec_loop_controls (&lens);
935 delete ivexpr_map;
936 delete scan_map;
937 epilogue_vinfos.release ();
938 delete scalar_costs;
939 delete vector_costs;
940
941 /* When we release an epilogue vinfo that we do not intend to use
942 avoid clearing AUX of the main loop which should continue to
943 point to the main loop vinfo since otherwise we'll leak that. */
944 if (loop->aux == this)
945 loop->aux = NULL;
946 }
947
948 /* Return an invariant or register for EXPR and emit necessary
949 computations in the LOOP_VINFO loop preheader. */
950
951 tree
952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
953 {
954 if (is_gimple_reg (expr)
955 || is_gimple_min_invariant (expr))
956 return expr;
957
958 if (! loop_vinfo->ivexpr_map)
959 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
960 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
961 if (! cached)
962 {
963 gimple_seq stmts = NULL;
964 cached = force_gimple_operand (unshare_expr (expr),
965 &stmts, true, NULL_TREE);
966 if (stmts)
967 {
968 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
969 gsi_insert_seq_on_edge_immediate (e, stmts);
970 }
971 }
972 return cached;
973 }
974
975 /* Return true if we can use CMP_TYPE as the comparison type to produce
976 all masks required to mask LOOP_VINFO. */
977
978 static bool
979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
980 {
981 rgroup_controls *rgm;
982 unsigned int i;
983 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
984 if (rgm->type != NULL_TREE
985 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
986 cmp_type, rgm->type,
987 OPTIMIZE_FOR_SPEED))
988 return false;
989 return true;
990 }
991
992 /* Calculate the maximum number of scalars per iteration for every
993 rgroup in LOOP_VINFO. */
994
995 static unsigned int
996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
997 {
998 unsigned int res = 1;
999 unsigned int i;
1000 rgroup_controls *rgm;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 res = MAX (res, rgm->max_nscalars_per_iter);
1003 return res;
1004 }
1005
1006 /* Calculate the minimum precision necessary to represent:
1007
1008 MAX_NITERS * FACTOR
1009
1010 as an unsigned integer, where MAX_NITERS is the maximum number of
1011 loop header iterations for the original scalar form of LOOP_VINFO. */
1012
1013 static unsigned
1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1015 {
1016 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1017
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028 /* Work out how many bits we need to represent the limit. */
1029 return wi::min_precision (max_ni * factor, UNSIGNED);
1030 }
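/* A small numeric illustration of the computation above (example values
   only, not used by the code): with a 32-bit unsigned counter type,
   MAX_NITERS is at most 2^32, so for FACTOR == 2 the product is 2^33 and
   wi::min_precision (2^33, UNSIGNED) == 34, meaning an IV wider than 32
   bits (in practice 64 bits) is needed to hold MAX_NITERS * FACTOR without
   wrapping.  */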
1031
1032 /* True if the loop needs peeling or partial vectors when vectorized. */
1033
1034 static bool
1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1036 {
1037 unsigned HOST_WIDE_INT const_vf;
1038 HOST_WIDE_INT max_niter
1039 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1040
1041 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044 (loop_vinfo));
1045
1046 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1048 {
1049 /* Work out the (constant) number of iterations that need to be
1050 peeled for reasons other than niters. */
1051 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053 peel_niter += 1;
1054 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056 return true;
1057 }
1058 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059 /* ??? When peeling for gaps but not alignment, we could
1060 try to check whether the (variable) niters is known to be
1061 VF * N + 1. That's something of a niche case though. */
1062 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065 < (unsigned) exact_log2 (const_vf))
1066 /* In case of versioning, check if the maximum number of
1067 iterations is greater than th. If they are identical,
1068 the epilogue is unnecessary. */
1069 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070 || ((unsigned HOST_WIDE_INT) max_niter
1071 > (th / const_vf) * const_vf))))
1072 return true;
1073
1074 return false;
1075 }
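/* Illustration of the constant-niters case above (example values only):
   with LOOP_VINFO_INT_NITERS == 100, a vectorization factor of 8, no
   peeling for alignment and peeling for gaps requested, peel_niter == 1
   and 100 - 1 == 99 is not a multiple of 8, so the loop needs an epilogue
   or partial vectors.  With 97 iterations instead, 97 - 1 == 96 is a
   multiple of 8 and no peeling for niters is required.  */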
1076
1077 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1078 whether we can actually generate the masks required. Return true if so,
1079 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1080
1081 static bool
1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1083 {
1084 unsigned int min_ni_width;
1085 unsigned int max_nscalars_per_iter
1086 = vect_get_max_nscalars_per_iter (loop_vinfo);
1087
1088 /* Use a normal loop if there are no statements that need masking.
1089 This only happens in rare degenerate cases: it means that the loop
1090 has no loads, no stores, and no live-out values. */
1091 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092 return false;
1093
1094 /* Work out how many bits we need to represent the limit. */
1095 min_ni_width
1096 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1097
1098 /* Find a scalar mode for which WHILE_ULT is supported. */
1099 opt_scalar_int_mode cmp_mode_iter;
1100 tree cmp_type = NULL_TREE;
1101 tree iv_type = NULL_TREE;
1102 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103 unsigned int iv_precision = UINT_MAX;
1104
1105 if (iv_limit != -1)
1106 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107 UNSIGNED);
1108
1109 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1110 {
1111 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112 if (cmp_bits >= min_ni_width
1113 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1114 {
1115 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116 if (this_type
1117 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1118 {
1119 /* Although we could stop as soon as we find a valid mode,
1120 there are at least two reasons why that's not always the
1121 best choice:
1122
1123 - An IV that's Pmode or wider is more likely to be reusable
1124 in address calculations than an IV that's narrower than
1125 Pmode.
1126
1127 - Doing the comparison in IV_PRECISION or wider allows
1128 a natural 0-based IV, whereas using a narrower comparison
1129 type requires mitigations against wrap-around.
1130
1131 Conversely, if the IV limit is variable, doing the comparison
1132 in a wider type than the original type can introduce
1133 unnecessary extensions, so picking the widest valid mode
1134 is not always a good choice either.
1135
1136 Here we prefer the first IV type that's Pmode or wider,
1137 and the first comparison type that's IV_PRECISION or wider.
1138 (The comparison type must be no wider than the IV type,
1139 to avoid extensions in the vector loop.)
1140
1141 ??? We might want to try continuing beyond Pmode for ILP32
1142 targets if CMP_BITS < IV_PRECISION. */
1143 iv_type = this_type;
1144 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145 cmp_type = this_type;
1146 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147 break;
1148 }
1149 }
1150 }
1151
1152 if (!cmp_type)
1153 return false;
1154
1155 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157 return true;
1158 }
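/* Sketch of what the chosen comparison type is used for (illustrative
   only, assuming the documented while_ult semantics): each rgroup mask is
   produced by something like

     mask = .WHILE_ULT (scalar_index, limit, mask_dummy);
       // lane I of MASK is active iff scalar_index + I < limit

   so CMP_TYPE must be wide enough to hold the iteration count scaled by
   the maximum number of scalars per iteration without wrapping, which is
   exactly what MIN_NI_WIDTH above captures.  */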
1159
1160 /* Check whether we can use vector access with length based on precision
1161 comparison. So far, to keep it simple, we only allow the case that the
1162 precision of the target supported length is larger than the precision
1163 required by loop niters. */
1164
1165 static bool
1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1167 {
1168 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169 return false;
1170
1171 machine_mode len_load_mode = get_len_load_store_mode
1172 (loop_vinfo->vector_mode, true).require ();
1173 machine_mode len_store_mode = get_len_load_store_mode
1174 (loop_vinfo->vector_mode, false).require ();
1175
1176 signed char partial_load_bias = internal_len_load_store_bias
1177 (IFN_LEN_LOAD, len_load_mode);
1178
1179 signed char partial_store_bias = internal_len_load_store_bias
1180 (IFN_LEN_STORE, len_store_mode);
1181
1182 gcc_assert (partial_load_bias == partial_store_bias);
1183
1184 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185 return false;
1186
1187 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188 len_loads with a length of zero. In order to avoid that we prohibit
1189 more than one loop length here. */
1190 if (partial_load_bias == -1
1191 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192 return false;
1193
1194 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1195
1196 unsigned int max_nitems_per_iter = 1;
1197 unsigned int i;
1198 rgroup_controls *rgl;
1199 /* Find the maximum number of items per iteration for every rgroup. */
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1201 {
1202 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1204 }
1205
1206 /* Work out how many bits we need to represent the length limit. */
1207 unsigned int min_ni_prec
1208 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1209
1210 /* Now use the maximum of the precisions below for one suitable IV type:
1211 - the IV's natural precision
1212 - the precision needed to hold: the maximum number of scalar
1213 iterations multiplied by the scale factor (min_ni_prec above)
1214 - the Pmode precision
1215
1216 If min_ni_prec is less than the precision of the current niters,
1217 we prefer to still use the niters type. Prefer to use Pmode or a
1218 wider IV to avoid narrow conversions. */
1219
1220 unsigned int ni_prec
1221 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222 min_ni_prec = MAX (min_ni_prec, ni_prec);
1223 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1224
1225 tree iv_type = NULL_TREE;
1226 opt_scalar_int_mode tmode_iter;
1227 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1228 {
1229 scalar_mode tmode = tmode_iter.require ();
1230 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1231
1232 /* ??? Do we really want to construct one IV whose precision exceeds
1233 BITS_PER_WORD? */
1234 if (tbits > BITS_PER_WORD)
1235 break;
1236
1237 /* Find the first available standard integral type. */
1238 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1239 {
1240 iv_type = build_nonstandard_integer_type (tbits, true);
1241 break;
1242 }
1243 }
1244
1245 if (!iv_type)
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "can't vectorize with length-based partial vectors"
1250 " because there is no suitable iv type.\n");
1251 return false;
1252 }
1253
1254 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1256
1257 return true;
1258 }
1259
1260 /* Calculate the cost of one scalar iteration of the loop. */
1261 static void
1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1263 {
1264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266 int nbbs = loop->num_nodes, factor;
1267 int innerloop_iters, i;
1268
1269 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1270
1271 /* Gather costs for statements in the scalar loop. */
1272
1273 /* FORNOW. */
1274 innerloop_iters = 1;
1275 if (loop->inner)
1276 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1277
1278 for (i = 0; i < nbbs; i++)
1279 {
1280 gimple_stmt_iterator si;
1281 basic_block bb = bbs[i];
1282
1283 if (bb->loop_father == loop->inner)
1284 factor = innerloop_iters;
1285 else
1286 factor = 1;
1287
1288 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1289 {
1290 gimple *stmt = gsi_stmt (si);
1291 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1292
1293 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294 continue;
1295
1296 /* Skip stmts that are not vectorized inside the loop. */
1297 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299 && (!STMT_VINFO_LIVE_P (vstmt_info)
1300 || !VECTORIZABLE_CYCLE_DEF
1301 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1302 continue;
1303
1304 vect_cost_for_stmt kind;
1305 if (STMT_VINFO_DATA_REF (stmt_info))
1306 {
1307 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308 kind = scalar_load;
1309 else
1310 kind = scalar_store;
1311 }
1312 else if (vect_nop_conversion_p (stmt_info))
1313 continue;
1314 else
1315 kind = scalar_stmt;
1316
1317 /* We are using vect_prologue here to avoid scaling twice
1318 by the inner loop factor. */
1319 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320 factor, kind, stmt_info, 0, vect_prologue);
1321 }
1322 }
1323
1324 /* Now accumulate cost. */
1325 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326 add_stmt_costs (loop_vinfo->scalar_costs,
1327 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328 loop_vinfo->scalar_costs->finish_cost (nullptr);
1329 }
1330
1331
1332 /* Function vect_analyze_loop_form.
1333
1334 Verify that certain CFG restrictions hold, including:
1335 - the loop has a pre-header
1336 - the loop has a single entry and exit
1337 - the loop exit condition is simple enough
1338 - the number of iterations can be analyzed, i.e., a countable loop. The
1339 niter could be analyzed under some assumptions. */
1340
1341 opt_result
1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1343 {
1344 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1345
1346 /* Different restrictions apply when we are considering an inner-most loop,
1347 vs. an outer (nested) loop.
1348 (FORNOW. May want to relax some of these restrictions in the future). */
1349
1350 info->inner_loop_cond = NULL;
1351 if (!loop->inner)
1352 {
1353 /* Inner-most loop. We currently require that the number of BBs is
1354 exactly 2 (the header and latch). Vectorizable inner-most loops
1355 look like this:
1356
1357 (pre-header)
1358 |
1359 header <--------+
1360 | | |
1361 | +--> latch --+
1362 |
1363 (exit-bb) */
1364
1365 if (loop->num_nodes != 2)
1366 return opt_result::failure_at (vect_location,
1367 "not vectorized:"
1368 " control flow in loop.\n");
1369
1370 if (empty_block_p (loop->header))
1371 return opt_result::failure_at (vect_location,
1372 "not vectorized: empty loop.\n");
1373 }
1374 else
1375 {
1376 class loop *innerloop = loop->inner;
1377 edge entryedge;
1378
1379 /* Nested loop. We currently require that the loop is doubly-nested,
1380 contains a single inner loop, and the number of BBs is exactly 5.
1381 Vectorizable outer-loops look like this:
1382
1383 (pre-header)
1384 |
1385 header <---+
1386 | |
1387 inner-loop |
1388 | |
1389 tail ------+
1390 |
1391 (exit-bb)
1392
1393 The inner-loop has the properties expected of inner-most loops
1394 as described above. */
1395
1396 if ((loop->inner)->inner || (loop->inner)->next)
1397 return opt_result::failure_at (vect_location,
1398 "not vectorized:"
1399 " multiple nested loops.\n");
1400
1401 if (loop->num_nodes != 5)
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized:"
1404 " control flow in loop.\n");
1405
1406 entryedge = loop_preheader_edge (innerloop);
1407 if (entryedge->src != loop->header
1408 || !single_exit (innerloop)
1409 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410 return opt_result::failure_at (vect_location,
1411 "not vectorized:"
1412 " unsupported outerloop form.\n");
1413
1414 /* Analyze the inner-loop. */
1415 vect_loop_form_info inner;
1416 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417 if (!res)
1418 {
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 "not vectorized: Bad inner loop.\n");
1422 return res;
1423 }
1424
1425 /* Don't support analyzing niter under assumptions for inner
1426 loop. */
1427 if (!integer_onep (inner.assumptions))
1428 return opt_result::failure_at (vect_location,
1429 "not vectorized: Bad inner loop.\n");
1430
1431 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432 return opt_result::failure_at (vect_location,
1433 "not vectorized: inner-loop count not"
1434 " invariant.\n");
1435
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "Considering outer-loop vectorization.\n");
1439 info->inner_loop_cond = inner.loop_cond;
1440 }
1441
1442 if (!single_exit (loop))
1443 return opt_result::failure_at (vect_location,
1444 "not vectorized: multiple exits.\n");
1445 if (EDGE_COUNT (loop->header->preds) != 2)
1446 return opt_result::failure_at (vect_location,
1447 "not vectorized:"
1448 " too many incoming edges.\n");
1449
1450 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1451 that the loop is represented as a do-while (with a proper if-guard
1452 before the loop if needed), where the loop header contains all the
1453 executable statements, and the latch is empty. */
1454 if (!empty_block_p (loop->latch)
1455 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456 return opt_result::failure_at (vect_location,
1457 "not vectorized: latch block not empty.\n");
1458
1459 /* Make sure the exit is not abnormal. */
1460 edge e = single_exit (loop);
1461 if (e->flags & EDGE_ABNORMAL)
1462 return opt_result::failure_at (vect_location,
1463 "not vectorized:"
1464 " abnormal loop exit edge.\n");
1465
1466 info->loop_cond
1467 = vect_get_loop_niters (loop, &info->assumptions,
1468 &info->number_of_iterations,
1469 &info->number_of_iterationsm1);
1470 if (!info->loop_cond)
1471 return opt_result::failure_at
1472 (vect_location,
1473 "not vectorized: complicated exit condition.\n");
1474
1475 if (integer_zerop (info->assumptions)
1476 || !info->number_of_iterations
1477 || chrec_contains_undetermined (info->number_of_iterations))
1478 return opt_result::failure_at
1479 (info->loop_cond,
1480 "not vectorized: number of iterations cannot be computed.\n");
1481
1482 if (integer_zerop (info->number_of_iterations))
1483 return opt_result::failure_at
1484 (info->loop_cond,
1485 "not vectorized: number of iterations = 0.\n");
1486
1487 if (!(tree_fits_shwi_p (info->number_of_iterations)
1488 && tree_to_shwi (info->number_of_iterations) > 0))
1489 {
1490 if (dump_enabled_p ())
1491 {
1492 dump_printf_loc (MSG_NOTE, vect_location,
1493 "Symbolic number of iterations is ");
1494 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495 dump_printf (MSG_NOTE, "\n");
1496 }
1497 }
1498
1499 return opt_result::success ();
1500 }
1501
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503 vect_analyze_loop_form result. */
1504
1505 loop_vec_info
1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507 const vect_loop_form_info *info,
1508 loop_vec_info main_loop_info)
1509 {
1510 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515 /* Also record the assumptions for versioning. */
1516 if (!integer_onep (info->assumptions) && !main_loop_info)
1517 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1518
1519 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521 if (info->inner_loop_cond)
1522 {
1523 stmt_vec_info inner_loop_cond_info
1524 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526 /* If we have an estimate on the number of iterations of the inner
1527 loop use that to limit the scale for costing, otherwise use
1528 --param vect-inner-loop-cost-factor literally. */
1529 widest_int nit;
1530 if (estimated_stmt_executions (loop->inner, &nit))
1531 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1533 }
1534
1535 return loop_vinfo;
1536 }
1537
1538
1539
1540 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1541 statements, update the vectorization factor. */
1542
1543 static void
1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1545 {
1546 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548 int nbbs = loop->num_nodes;
1549 poly_uint64 vectorization_factor;
1550 int i;
1551
1552 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1553
1554 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555 gcc_assert (known_ne (vectorization_factor, 0U));
1556
1557 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1558 the vectorization factor of the loop is the unrolling factor required by
1559 the SLP instances. If that unrolling factor is 1, we say that we
1560 perform pure SLP on the loop - cross-iteration parallelism is not
1561 exploited. */
1562 bool only_slp_in_loop = true;
1563 for (i = 0; i < nbbs; i++)
1564 {
1565 basic_block bb = bbs[i];
1566 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1568 {
1569 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570 if (!stmt_info)
1571 continue;
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1577 }
1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 gsi_next (&si))
1580 {
1581 if (is_gimple_debug (gsi_stmt (si)))
1582 continue;
1583 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584 stmt_info = vect_stmt_to_vectorize (stmt_info);
1585 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587 && !PURE_SLP_STMT (stmt_info))
1588 /* STMT needs both SLP and loop-based vectorization. */
1589 only_slp_in_loop = false;
1590 }
1591 }
1592
1593 if (only_slp_in_loop)
1594 {
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_NOTE, vect_location,
1597 "Loop contains only SLP stmts\n");
1598 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1599 }
1600 else
1601 {
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Loop contains SLP and non-SLP stmts\n");
1605 /* Both the vectorization factor and unroll factor have the form
1606 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607 so they must have a common multiple. */
1608 vectorization_factor
1609 = force_common_multiple (vectorization_factor,
1610 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611 }
1612
1613 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614 if (dump_enabled_p ())
1615 {
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "Updating vectorization factor to ");
1618 dump_dec (MSG_NOTE, vectorization_factor);
1619 dump_printf (MSG_NOTE, ".\n");
1620 }
1621 }
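/* Example of the adjustment above (illustrative numbers only): if the
   non-SLP statements require a vectorization factor of 4 while the SLP
   instances require an unrolling factor of 8, force_common_multiple
   yields 8, so the loop is unrolled enough for both forms of
   vectorization.  */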
1622
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624 the other phi in the reduction is also relevant for vectorization.
1625 This rejects cases such as:
1626
1627 outer1:
1628 x_1 = PHI <x_3(outer2), ...>;
1629 ...
1630
1631 inner:
1632 x_2 = ...;
1633 ...
1634
1635 outer2:
1636 x_3 = PHI <x_2(inner)>;
1637
1638 if nothing in x_2 or elsewhere makes x_1 relevant. */
1639
1640 static bool
1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1642 {
1643 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644 return false;
1645
1646 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 }
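
/* Illustrative source-level example (an assumption for exposition only):
   a double reduction is a reduction carried by both loops of a nest, e.g.

     int sum = 0;
     for (i = 0; i < n; i++)      // outer1/outer2 above
       for (j = 0; j < m; j++)    // inner above
         sum += a[i][j];

   Here the outer-loop phi for SUM plays the role of x_1 and the loop-closed
   phi after the inner loop plays the role of x_3; the helper above only
   treats the pair as an active double reduction when that second phi is
   itself relevant for vectorization.  */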
1648
1649 /* Function vect_analyze_loop_operations.
1650
1651 Scan the loop stmts and make sure they are all vectorizable. */
1652
1653 static opt_result
1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1655 {
1656 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658 int nbbs = loop->num_nodes;
1659 int i;
1660 stmt_vec_info stmt_info;
1661 bool need_to_vectorize = false;
1662 bool ok;
1663
1664 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1665
1666 auto_vec<stmt_info_for_cost> cost_vec;
1667
1668 for (i = 0; i < nbbs; i++)
1669 {
1670 basic_block bb = bbs[i];
1671
1672 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673 gsi_next (&si))
1674 {
1675 gphi *phi = si.phi ();
1676 ok = true;
1677
1678 stmt_info = loop_vinfo->lookup_stmt (phi);
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681 if (virtual_operand_p (gimple_phi_result (phi)))
1682 continue;
1683
1684 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685 (i.e., a phi in the tail of the outer-loop). */
1686 if (! is_loop_header_bb_p (bb))
1687 {
1688 /* FORNOW: we currently don't support the case where these phis
1689 are not used in the outer loop (unless it is a double reduction,
1690 i.e., this phi is vect_reduction_def), because this case
1691 requires us to actually do something here.  */
1692 if (STMT_VINFO_LIVE_P (stmt_info)
1693 && !vect_active_double_reduction_p (stmt_info))
1694 return opt_result::failure_at (phi,
1695 "Unsupported loop-closed phi"
1696 " in outer-loop.\n");
1697
1698 /* If PHI is used in the outer loop, we check that its operand
1699 is defined in the inner loop. */
1700 if (STMT_VINFO_RELEVANT_P (stmt_info))
1701 {
1702 tree phi_op;
1703
1704 if (gimple_phi_num_args (phi) != 1)
1705 return opt_result::failure_at (phi, "unsupported phi");
1706
1707 phi_op = PHI_ARG_DEF (phi, 0);
1708 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709 if (!op_def_info)
1710 return opt_result::failure_at (phi, "unsupported phi\n");
1711
1712 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713 && (STMT_VINFO_RELEVANT (op_def_info)
1714 != vect_used_in_outer_by_reduction))
1715 return opt_result::failure_at (phi, "unsupported phi\n");
1716
1717 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718 || (STMT_VINFO_DEF_TYPE (stmt_info)
1719 == vect_double_reduction_def))
1720 && !vectorizable_lc_phi (loop_vinfo,
1721 stmt_info, NULL, NULL))
1722 return opt_result::failure_at (phi, "unsupported phi\n");
1723 }
1724
1725 continue;
1726 }
1727
1728 gcc_assert (stmt_info);
1729
1730 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731 || STMT_VINFO_LIVE_P (stmt_info))
1732 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733 /* A scalar-dependence cycle that we don't support. */
1734 return opt_result::failure_at (phi,
1735 "not vectorized:"
1736 " scalar dependence cycle.\n");
1737
1738 if (STMT_VINFO_RELEVANT_P (stmt_info))
1739 {
1740 need_to_vectorize = true;
1741 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742 && ! PURE_SLP_STMT (stmt_info))
1743 ok = vectorizable_induction (loop_vinfo,
1744 stmt_info, NULL, NULL,
1745 &cost_vec);
1746 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747 || (STMT_VINFO_DEF_TYPE (stmt_info)
1748 == vect_double_reduction_def)
1749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750 && ! PURE_SLP_STMT (stmt_info))
1751 ok = vectorizable_reduction (loop_vinfo,
1752 stmt_info, NULL, NULL, &cost_vec);
1753 }
1754
1755 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1756 if (ok
1757 && STMT_VINFO_LIVE_P (stmt_info)
1758 && !PURE_SLP_STMT (stmt_info))
1759 ok = vectorizable_live_operation (loop_vinfo,
1760 stmt_info, NULL, NULL, NULL,
1761 -1, false, &cost_vec);
1762
1763 if (!ok)
1764 return opt_result::failure_at (phi,
1765 "not vectorized: relevant phi not "
1766 "supported: %G",
1767 static_cast <gimple *> (phi));
1768 }
1769
1770 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771 gsi_next (&si))
1772 {
1773 gimple *stmt = gsi_stmt (si);
1774 if (!gimple_clobber_p (stmt)
1775 && !is_gimple_debug (stmt))
1776 {
1777 opt_result res
1778 = vect_analyze_stmt (loop_vinfo,
1779 loop_vinfo->lookup_stmt (stmt),
1780 &need_to_vectorize,
1781 NULL, NULL, &cost_vec);
1782 if (!res)
1783 return res;
1784 }
1785 }
1786 } /* bbs */
1787
1788 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1789
1790 /* All operations in the loop are either irrelevant (deal with loop
1791 control, or dead), or only used outside the loop and can be moved
1792 out of the loop (e.g. invariants, inductions). The loop can be
1793 optimized away by scalar optimizations. We're better off not
1794 touching this loop. */
1795 if (!need_to_vectorize)
1796 {
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_NOTE, vect_location,
1799 "All the computation can be taken out of the loop.\n");
1800 return opt_result::failure_at
1801 (vect_location,
1802 "not vectorized: redundant loop. no profit to vectorize.\n");
1803 }
1804
1805 return opt_result::success ();
1806 }
1807
1808 /* Return true if we know that the iteration count is smaller than the
1809 vectorization factor. Return false if it isn't, or if we can't be sure
1810 either way. */
1811
1812 static bool
1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1814 {
1815 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816
1817 HOST_WIDE_INT max_niter;
1818 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820 else
1821 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1822
1823 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824 return true;
1825
1826 return false;
1827 }
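
/* A minimal worked example (the numbers are assumptions): with an assumed
   VF of 4 and a known iteration count of 3, max_niter = 3 < 4, so the
   function returns true - without partial vectors such a loop could not
   execute even a single full vector iteration.  */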
1828
1829 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1830 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1831 definitely no, or -1 if it's worth retrying. */
1832
1833 static int
1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835 unsigned *suggested_unroll_factor)
1836 {
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1839
1840 /* Only loops that can handle partially-populated vectors can have iteration
1841 counts less than the vectorization factor. */
1842 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1843 {
1844 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1845 {
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: iteration count smaller than "
1849 "vectorization factor.\n");
1850 return 0;
1851 }
1852 }
1853
1854 /* If using the "very cheap" model, reject cases in which we'd keep
1855 a copy of the scalar code (even if we might be able to vectorize it). */
1856 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "some scalar iterations would need to be peeled\n");
1864 return 0;
1865 }
1866
1867 int min_profitable_iters, min_profitable_estimate;
1868 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869 &min_profitable_estimate,
1870 suggested_unroll_factor);
1871
1872 if (min_profitable_iters < 0)
1873 {
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "not vectorized: vectorization not profitable.\n");
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879 "not vectorized: vector version will never be "
1880 "profitable.\n");
1881 return -1;
1882 }
1883
1884 int min_scalar_loop_bound = (param_min_vect_loop_bound
1885 * assumed_vf);
1886
1887 /* Use the cost model only if it is more conservative than the
1888 user-specified threshold.  */
1889 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890 min_profitable_iters);
1891
1892 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1893
1894 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1896 {
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "not vectorized: vectorization not profitable.\n");
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "not vectorized: iteration count smaller than user "
1903 "specified loop bound parameter or minimum profitable "
1904 "iterations (whichever is more conservative).\n");
1905 return 0;
1906 }
1907
1908 /* The static profitability threshold min_profitable_estimate includes
1909 the cost of having to check at runtime whether the scalar loop
1910 should be used instead. If it turns out that we don't need or want
1911 such a check, the threshold we should use for the static estimate
1912 is simply the point at which the vector loop becomes more profitable
1913 than the scalar loop. */
1914 if (min_profitable_estimate > min_profitable_iters
1915 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1919 {
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922 " choice between the scalar and vector loops\n");
1923 min_profitable_estimate = min_profitable_iters;
1924 }
1925
1926 /* If the vector loop needs multiple iterations to be beneficial then
1927 things are probably too close to call, and the conservative thing
1928 would be to stick with the scalar code. */
1929 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1931 {
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "one iteration of the vector loop would be"
1935 " more expensive than the equivalent number of"
1936 " iterations of the scalar loop\n");
1937 return 0;
1938 }
1939
1940 HOST_WIDE_INT estimated_niter;
1941
1942 /* If we are vectorizing an epilogue then we know the maximum number of
1943 scalar iterations it will cover is at least one lower than the
1944 vectorization factor of the main loop. */
1945 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 estimated_niter
1947 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948 else
1949 {
1950 estimated_niter = estimated_stmt_executions_int (loop);
1951 if (estimated_niter == -1)
1952 estimated_niter = likely_max_stmt_executions_int (loop);
1953 }
1954 if (estimated_niter != -1
1955 && ((unsigned HOST_WIDE_INT) estimated_niter
1956 < MAX (th, (unsigned) min_profitable_estimate)))
1957 {
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960 "not vectorized: estimated iteration count too "
1961 "small.\n");
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "not vectorized: estimated iteration count smaller "
1965 "than specified loop bound parameter or minimum "
1966 "profitable iterations (whichever is more "
1967 "conservative).\n");
1968 return -1;
1969 }
1970
1971 return 1;
1972 }
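
/* A hedged numeric sketch of the threshold computation above, using
   made-up values: with param_min_vect_loop_bound == 0, assumed_vf == 4 and
   min_profitable_iters == 7,

     min_scalar_loop_bound = 0 * 4 = 0
     th = MAX (0, 7) = 7

   so a loop with a known iteration count below 7 is rejected as not
   profitable, while unknown counts fall through to the estimate-based
   checks that follow.  */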
1973
1974 static opt_result
1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976 vec<data_reference_p> *datarefs,
1977 unsigned int *n_stmts)
1978 {
1979 *n_stmts = 0;
1980 for (unsigned i = 0; i < loop->num_nodes; i++)
1981 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982 !gsi_end_p (gsi); gsi_next (&gsi))
1983 {
1984 gimple *stmt = gsi_stmt (gsi);
1985 if (is_gimple_debug (stmt))
1986 continue;
1987 ++(*n_stmts);
1988 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989 NULL, 0);
1990 if (!res)
1991 {
1992 if (is_gimple_call (stmt) && loop->safelen)
1993 {
1994 tree fndecl = gimple_call_fndecl (stmt), op;
1995 if (fndecl != NULL_TREE)
1996 {
1997 cgraph_node *node = cgraph_node::get (fndecl);
1998 if (node != NULL && node->simd_clones != NULL)
1999 {
2000 unsigned int j, n = gimple_call_num_args (stmt);
2001 for (j = 0; j < n; j++)
2002 {
2003 op = gimple_call_arg (stmt, j);
2004 if (DECL_P (op)
2005 || (REFERENCE_CLASS_P (op)
2006 && get_base_address (op)))
2007 break;
2008 }
2009 op = gimple_call_lhs (stmt);
2010 /* Ignore #pragma omp declare simd functions
2011 if they don't have data references in the
2012 call stmt itself. */
2013 if (j == n
2014 && !(op
2015 && (DECL_P (op)
2016 || (REFERENCE_CLASS_P (op)
2017 && get_base_address (op)))))
2018 continue;
2019 }
2020 }
2021 }
2022 return res;
2023 }
2024 /* If dependence analysis will give up due to the limit on the
2025 number of datarefs stop here and fail fatally. */
2026 if (datarefs->length ()
2027 > (unsigned)param_loop_max_datarefs_for_datadeps)
2028 return opt_result::failure_at (stmt, "exceeded param "
2029 "loop-max-datarefs-for-datadeps\n");
2030 }
2031 return opt_result::success ();
2032 }
2033
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035 group. */
2036 static void
2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2038 {
2039 unsigned int i;
2040 struct data_reference *dr;
2041
2042 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2043
2044 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045 FOR_EACH_VEC_ELT (datarefs, i, dr)
2046 {
2047 gcc_assert (DR_REF (dr));
2048 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2049
2050 /* Check if the load is a part of an interleaving chain. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 {
2053 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055 unsigned int group_size = DR_GROUP_SIZE (first_element);
2056
2057 /* Check if SLP-only groups. */
2058 if (!STMT_SLP_TYPE (stmt_info)
2059 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2060 {
2061 /* Dissolve the group. */
2062 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2063
2064 stmt_vec_info vinfo = first_element;
2065 while (vinfo)
2066 {
2067 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070 DR_GROUP_SIZE (vinfo) = 1;
2071 if (STMT_VINFO_STRIDED_P (first_element))
2072 DR_GROUP_GAP (vinfo) = 0;
2073 else
2074 DR_GROUP_GAP (vinfo) = group_size - 1;
2075 /* Duplicate and adjust alignment info, it needs to
2076 be present on each group leader, see dr_misalignment. */
2077 if (vinfo != first_element)
2078 {
2079 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080 dr_info2->target_alignment = dr_info->target_alignment;
2081 int misalignment = dr_info->misalignment;
2082 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2083 {
2084 HOST_WIDE_INT diff
2085 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087 unsigned HOST_WIDE_INT align_c
2088 = dr_info->target_alignment.to_constant ();
2089 misalignment = (misalignment + diff) % align_c;
2090 }
2091 dr_info2->misalignment = misalignment;
2092 }
2093 vinfo = next;
2094 }
2095 }
2096 }
2097 }
2098 }
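
/* Worked example for the misalignment adjustment above (the values are
   assumptions): if the group leader has target_alignment 16 and
   misalignment 4, and a dissolved member's DR_INIT is 8 bytes further
   along, then

     misalignment = (4 + 8) % 16 = 12

   which is what the member reports once it becomes its own group
   leader.  */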
2099
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101 some scalar iterations still to do. If so, decide how we should
2102 handle those scalar iterations. The possibilities are:
2103
2104 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105 In this case:
2106
2107 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109 LOOP_VINFO_PEELING_FOR_NITER == false
2110
2111 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112 to handle the remaining scalar iterations. In this case:
2113
2114 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115 LOOP_VINFO_PEELING_FOR_NITER == true
2116
2117 There are two choices:
2118
2119 (2a) Consider vectorizing the epilogue loop at the same VF as the
2120 main loop, but using partial vectors instead of full vectors.
2121 In this case:
2122
2123 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2124
2125 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126 In this case:
2127
2128 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2129
2130 When FOR_EPILOGUE_P is true, make this determination based on the
2131 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132 based on the assumption that LOOP_VINFO is the main loop. The caller
2133 has made sure that the number of iterations is set appropriately for
2134 this value of FOR_EPILOGUE_P. */
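
/* Illustrative sketch of the two strategies above at the source level
   (the loop and the factor of 4 are assumptions for the example):

     for (i = 0; i < n; i++)
       a[i] += b[i];

   With choice (1) a single vector loop handles the tail itself, using a
   mask or length limited to the remaining n % 4 elements on its last
   iteration.  With choice (2) the vector loop runs n / 4 full-vector
   iterations and the remaining n % 4 iterations are handled by a separate
   epilogue loop, which is what LOOP_VINFO_PEELING_FOR_NITER records.  */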
2135
2136 opt_result
2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138 bool for_epilogue_p)
2139 {
2140 /* Determine whether there would be any scalar iterations left over. */
2141 bool need_peeling_or_partial_vectors_p
2142 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2143
2144 /* Decide whether to vectorize the loop with partial vectors. */
2145 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148 && need_peeling_or_partial_vectors_p)
2149 {
2150 /* For partial-vector-usage=1, try to push the handling of partial
2151 vectors to the epilogue, with the main loop continuing to operate
2152 on full vectors.
2153
2154 If we are unrolling we also do not want to use partial vectors. This
2155 is to avoid the overhead of generating multiple masks and also to
2156 avoid having to execute entire iterations of FALSE masked instructions
2157 when dealing with one or fewer full iterations.
2158
2159 ??? We could then end up failing to use partial vectors if we
2160 decide to peel iterations into a prologue, and if the main loop
2161 then ends up processing fewer than VF iterations. */
2162 if ((param_vect_partial_vector_usage == 1
2163 || loop_vinfo->suggested_unroll_factor > 1)
2164 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167 else
2168 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2169 }
2170
2171 if (dump_enabled_p ())
2172 {
2173 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "operating on partial vectors%s.\n",
2176 for_epilogue_p ? " for epilogue loop" : "");
2177 else
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "operating only on full vectors%s.\n",
2180 for_epilogue_p ? " for epilogue loop" : "");
2181 }
2182
2183 if (for_epilogue_p)
2184 {
2185 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186 gcc_assert (orig_loop_vinfo);
2187 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2190 }
2191
2192 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2194 {
2195 /* Check that the loop processes at least one full vector. */
2196 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198 if (known_lt (wi::to_widest (scalar_niters), vf))
2199 return opt_result::failure_at (vect_location,
2200 "loop does not have enough iterations"
2201 " to support vectorization.\n");
2202
2203 /* If we need to peel an extra epilogue iteration to handle data
2204 accesses with gaps, check that there are enough scalar iterations
2205 available.
2206
2207 The check above is redundant with this one when peeling for gaps,
2208 but the distinction is useful for diagnostics. */
2209 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212 return opt_result::failure_at (vect_location,
2213 "loop does not have enough iterations"
2214 " to support peeling for gaps.\n");
2215 }
2216
2217 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219 && need_peeling_or_partial_vectors_p);
2220
2221 return opt_result::success ();
2222 }
2223
2224 /* Function vect_analyze_loop_2.
2225
2226 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227 for it. The different analyses will record information in the
2228 loop_vec_info struct. */
2229 static opt_result
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231 unsigned *suggested_unroll_factor)
2232 {
2233 opt_result ok = opt_result::success ();
2234 int res;
2235 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236 poly_uint64 min_vf = 2;
2237 loop_vec_info orig_loop_vinfo = NULL;
2238
2239 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240 loop_vec_info of the first vectorized loop. */
2241 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243 else
2244 orig_loop_vinfo = loop_vinfo;
2245 gcc_assert (orig_loop_vinfo);
2246
2247 /* The first group of checks is independent of the vector size. */
2248 fatal = true;
2249
2250 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252 return opt_result::failure_at (vect_location,
2253 "not vectorized: simd if(0)\n");
2254
2255 /* Find all data references in the loop (which correspond to vdefs/vuses)
2256 and analyze their evolution in the loop. */
2257
2258 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2259
2260 /* Gather the data references and count stmts in the loop. */
2261 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2262 {
2263 opt_result res
2264 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265 &LOOP_VINFO_DATAREFS (loop_vinfo),
2266 &LOOP_VINFO_N_STMTS (loop_vinfo));
2267 if (!res)
2268 {
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "not vectorized: loop contains function "
2272 "calls or data references that cannot "
2273 "be analyzed\n");
2274 return res;
2275 }
2276 loop_vinfo->shared->save_datarefs ();
2277 }
2278 else
2279 loop_vinfo->shared->check_datarefs ();
2280
2281 /* Analyze the data references and also adjust the minimal
2282 vectorization factor according to the loads and stores. */
2283
2284 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285 if (!ok)
2286 {
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data references.\n");
2290 return ok;
2291 }
2292
2293 /* Classify all cross-iteration scalar data-flow cycles.
2294 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2295 vect_analyze_scalar_cycles (loop_vinfo);
2296
2297 vect_pattern_recog (loop_vinfo);
2298
2299 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2300
2301 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2303
2304 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305 if (!ok)
2306 {
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "bad data access.\n");
2310 return ok;
2311 }
2312
2313 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2314
2315 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316 if (!ok)
2317 {
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "unexpected pattern.\n");
2321 return ok;
2322 }
2323
2324 /* While the rest of the analysis below depends on it in some way. */
2325 fatal = false;
2326
2327 /* Analyze data dependences between the data-refs in the loop
2328 and adjust the maximum vectorization factor according to
2329 the dependences.
2330 FORNOW: fail at the first data dependence that we encounter. */
2331
2332 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333 if (!ok)
2334 {
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "bad data dependence.\n");
2338 return ok;
2339 }
2340 if (max_vf != MAX_VECTORIZATION_FACTOR
2341 && maybe_lt (max_vf, min_vf))
2342 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2344
2345 ok = vect_determine_vectorization_factor (loop_vinfo);
2346 if (!ok)
2347 {
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350 "can't determine vectorization factor.\n");
2351 return ok;
2352 }
2353 if (max_vf != MAX_VECTORIZATION_FACTOR
2354 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2356
2357 /* Compute the scalar iteration cost. */
2358 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2359
2360 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361
2362 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2363 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364 if (!ok)
2365 return ok;
2366
2367 /* If there are any SLP instances mark them as pure_slp. */
2368 bool slp = vect_make_slp_decision (loop_vinfo);
2369 if (slp)
2370 {
2371 /* Find stmts that need to be both vectorized and SLPed. */
2372 vect_detect_hybrid_slp (loop_vinfo);
2373
2374 /* Update the vectorization factor based on the SLP decision. */
2375 vect_update_vf_for_slp (loop_vinfo);
2376
2377 /* Optimize the SLP graph with the vectorization factor fixed. */
2378 vect_optimize_slp (loop_vinfo);
2379
2380 /* Gather the loads reachable from the SLP graph entries. */
2381 vect_gather_slp_loads (loop_vinfo);
2382 }
2383
2384 bool saved_can_use_partial_vectors_p
2385 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2386
2387 /* We don't expect to have to roll back to anything other than an empty
2388 set of rgroups. */
2389 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2390
2391 /* This is the point where we can re-start analysis with SLP forced off. */
2392 start_over:
2393
2394 /* Apply the suggested unrolling factor, this was determined by the backend
2395 during finish_cost the first time we ran the analysis for this
2396 vector mode. */
2397 if (loop_vinfo->suggested_unroll_factor > 1)
2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2399
2400 /* Now the vectorization factor is final. */
2401 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402 gcc_assert (known_ne (vectorization_factor, 0U));
2403
2404 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2405 {
2406 dump_printf_loc (MSG_NOTE, vect_location,
2407 "vectorization_factor = ");
2408 dump_dec (MSG_NOTE, vectorization_factor);
2409 dump_printf (MSG_NOTE, ", niters = %wd\n",
2410 LOOP_VINFO_INT_NITERS (loop_vinfo));
2411 }
2412
2413 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2414
2415 /* Analyze the alignment of the data-refs in the loop.
2416 Fail if a data reference is found that cannot be vectorized. */
2417
2418 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419 if (!ok)
2420 {
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423 "bad data alignment.\n");
2424 return ok;
2425 }
2426
2427 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428 It is important to call pruning after vect_analyze_data_ref_accesses,
2429 since we use grouping information gathered by interleaving analysis. */
2430 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431 if (!ok)
2432 return ok;
2433
2434 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435 vectorization, since we do not want to add extra peeling or
2436 add versioning for alignment. */
2437 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438 /* This pass will decide on using loop versioning and/or loop peeling in
2439 order to enhance the alignment of data references in the loop. */
2440 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441 if (!ok)
2442 return ok;
2443
2444 if (slp)
2445 {
2446 /* Analyze operations in the SLP instances. Note this may
2447 remove unsupported SLP instances which makes the above
2448 SLP kind detection invalid. */
2449 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450 vect_slp_analyze_operations (loop_vinfo);
2451 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2452 {
2453 ok = opt_result::failure_at (vect_location,
2454 "unsupported SLP instances\n");
2455 goto again;
2456 }
2457
2458 /* Check whether any load in ALL SLP instances is possibly permuted. */
2459 slp_tree load_node, slp_root;
2460 unsigned i, x;
2461 slp_instance instance;
2462 bool can_use_lanes = true;
2463 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2464 {
2465 slp_root = SLP_INSTANCE_TREE (instance);
2466 int group_size = SLP_TREE_LANES (slp_root);
2467 tree vectype = SLP_TREE_VECTYPE (slp_root);
2468 bool loads_permuted = false;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2470 {
2471 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472 continue;
2473 unsigned j;
2474 stmt_vec_info load_info;
2475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2477 {
2478 loads_permuted = true;
2479 break;
2480 }
2481 }
2482
2483 /* If the loads and stores can be handled with load/store-lane
2484 instructions record it and move on to the next instance. */
2485 if (loads_permuted
2486 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487 && vect_store_lanes_supported (vectype, group_size, false))
2488 {
2489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2490 {
2491 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493 /* Use SLP for strided accesses (or if we can't
2494 load-lanes). */
2495 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496 || ! vect_load_lanes_supported
2497 (STMT_VINFO_VECTYPE (stmt_vinfo),
2498 DR_GROUP_SIZE (stmt_vinfo), false))
2499 break;
2500 }
2501
2502 can_use_lanes
2503 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2504
2505 if (can_use_lanes && dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "SLP instance %p can use load/store-lanes\n",
2508 instance);
2509 }
2510 else
2511 {
2512 can_use_lanes = false;
2513 break;
2514 }
2515 }
2516
2517 /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 with SLP disabled. */
2519 if (can_use_lanes)
2520 {
2521 ok = opt_result::failure_at (vect_location,
2522 "Built SLP cancelled: can use "
2523 "load/store-lanes\n");
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "Built SLP cancelled: all SLP instances support "
2527 "load/store-lanes\n");
2528 goto again;
2529 }
2530 }
2531
2532 /* Dissolve SLP-only groups. */
2533 vect_dissolve_slp_only_groups (loop_vinfo);
2534
2535 /* Scan all the remaining operations in the loop that are not subject
2536 to SLP and make sure they are vectorizable. */
2537 ok = vect_analyze_loop_operations (loop_vinfo);
2538 if (!ok)
2539 {
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 "bad operation or unsupported loop bound.\n");
2543 return ok;
2544 }
2545
2546 /* For now, we don't expect to mix both masking and length approaches for one
2547 loop; disable the use of partial vectors if both are recorded.  */
2548 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2551 {
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "can't vectorize a loop with partial vectors"
2555 " because we don't expect to mix different"
2556 " approaches with partial vectors for the"
2557 " same loop.\n");
2558 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2559 }
2560
2561 /* If we still have the option of using partial vectors,
2562 check whether we can generate the necessary loop controls. */
2563 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564 && !vect_verify_full_masking (loop_vinfo)
2565 && !vect_verify_loop_lens (loop_vinfo))
2566 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2567
2568 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569 to be able to handle fewer than VF scalars, or needs to have a lower VF
2570 than the main loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575 return opt_result::failure_at (vect_location,
2576 "Vectorization factor too high for"
2577 " epilogue loop.\n");
2578
2579 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580 assuming that the loop will be used as a main loop. We will redo
2581 this analysis later if we instead decide to use the loop as an
2582 epilogue loop. */
2583 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584 if (!ok)
2585 return ok;
2586
2587 /* Check the costings of the loop make vectorizing worthwhile. */
2588 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589 if (res < 0)
2590 {
2591 ok = opt_result::failure_at (vect_location,
2592 "Loop costings may not be worthwhile.\n");
2593 goto again;
2594 }
2595 if (!res)
2596 return opt_result::failure_at (vect_location,
2597 "Loop costings not worthwhile.\n");
2598
2599 /* If an epilogue loop is required make sure we can create one. */
2600 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602 {
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605 if (!vect_can_advance_ivs_p (loop_vinfo)
2606 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607 single_exit (LOOP_VINFO_LOOP
2608 (loop_vinfo))))
2609 {
2610 ok = opt_result::failure_at (vect_location,
2611 "not vectorized: can't create required "
2612 "epilog loop\n");
2613 goto again;
2614 }
2615 }
2616
2617 /* During peeling, we need to check if the number of loop iterations is
2618 enough for both the peeled prolog loop and the vector loop.  This check
2619 can be merged along with threshold check of loop versioning, so
2620 increase threshold for this case if necessary.
2621
2622 If we are analyzing an epilogue we still want to check what its
2623 versioning threshold would be. If we decide to vectorize the epilogues we
2624 will want to use the lowest versioning threshold of all epilogues and main
2625 loop. This will enable us to enter a vectorized epilogue even when
2626 versioning the loop. We can't simply check whether the epilogue requires
2627 versioning though since we may have skipped some versioning checks when
2628 analyzing the epilogue. For instance, checks for alias versioning will be
2629 skipped when dealing with epilogues as we assume we already checked them
2630 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2631 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2632 {
2633 poly_uint64 niters_th = 0;
2634 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2635
2636 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2637 {
2638 /* Niters for peeled prolog loop. */
2639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2640 {
2641 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2644 }
2645 else
2646 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 }
2648
2649 /* Niters for at least one iteration of vectorized loop. */
2650 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652 /* One additional iteration because of peeling for gap. */
2653 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654 niters_th += 1;
2655
2656 /* Use the same condition as vect_transform_loop to decide when to use
2657 the cost to determine a versioning threshold. */
2658 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659 && ordered_p (th, niters_th))
2660 niters_th = ordered_max (poly_uint64 (th), niters_th);
2661
2662 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2663 }
2664
2665 gcc_assert (known_eq (vectorization_factor,
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2667
2668 /* Ok to vectorize! */
2669 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670 return opt_result::success ();
2671
2672 again:
2673 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2674 gcc_assert (!ok);
2675
2676 /* Try again with SLP forced off but if we didn't do any SLP there is
2677 no point in re-trying. */
2678 if (!slp)
2679 return ok;
2680
2681 /* If there are reduction chains re-trying will fail anyway. */
2682 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683 return ok;
2684
2685 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686 via interleaving or lane instructions. */
2687 slp_instance instance;
2688 slp_tree node;
2689 unsigned i, j;
2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691 {
2692 stmt_vec_info vinfo;
2693 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695 continue;
2696 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697 unsigned int size = DR_GROUP_SIZE (vinfo);
2698 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699 if (! vect_store_lanes_supported (vectype, size, false)
2700 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701 && ! vect_grouped_store_supported (vectype, size))
2702 return opt_result::failure_at (vinfo->stmt,
2703 "unsupported grouped store\n");
2704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2705 {
2706 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709 size = DR_GROUP_SIZE (vinfo);
2710 vectype = STMT_VINFO_VECTYPE (vinfo);
2711 if (! vect_load_lanes_supported (vectype, size, false)
2712 && ! vect_grouped_load_supported (vectype, single_element_p,
2713 size))
2714 return opt_result::failure_at (vinfo->stmt,
2715 "unsupported grouped load\n");
2716 }
2717 }
2718
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "re-trying with SLP disabled\n");
2722
2723 /* Roll back state appropriately. No SLP this time. */
2724 slp = false;
2725 /* Restore vectorization factor as it were without SLP. */
2726 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727 /* Free the SLP instances. */
2728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729 vect_free_slp_instance (instance);
2730 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731 /* Reset SLP type to loop_vect on all stmts. */
2732 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2733 {
2734 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736 !gsi_end_p (si); gsi_next (&si))
2737 {
2738 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739 STMT_SLP_TYPE (stmt_info) = loop_vect;
2740 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2742 {
2743 /* vectorizable_reduction adjusts reduction stmt def-types,
2744 restore them to that of the PHI. */
2745 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746 = STMT_VINFO_DEF_TYPE (stmt_info);
2747 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 (STMT_VINFO_REDUC_DEF (stmt_info)))
2749 = STMT_VINFO_DEF_TYPE (stmt_info);
2750 }
2751 }
2752 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753 !gsi_end_p (si); gsi_next (&si))
2754 {
2755 if (is_gimple_debug (gsi_stmt (si)))
2756 continue;
2757 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758 STMT_SLP_TYPE (stmt_info) = loop_vect;
2759 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2760 {
2761 stmt_vec_info pattern_stmt_info
2762 = STMT_VINFO_RELATED_STMT (stmt_info);
2763 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2765
2766 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769 !gsi_end_p (pi); gsi_next (&pi))
2770 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771 = loop_vect;
2772 }
2773 }
2774 }
2775 /* Free optimized alias test DDRS. */
2776 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779 /* Reset target cost data. */
2780 delete loop_vinfo->vector_costs;
2781 loop_vinfo->vector_costs = nullptr;
2782 /* Reset accumulated rgroup information. */
2783 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785 /* Reset assorted flags. */
2786 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791 = saved_can_use_partial_vectors_p;
2792
2793 goto start_over;
2794 }
2795
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2798 OLD_LOOP_VINFO is better unless something specifically indicates
2799 otherwise.
2800
2801 Note that this deliberately isn't a partial order. */
2802
2803 static bool
2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805 loop_vec_info old_loop_vinfo)
2806 {
2807 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2809
2810 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2812
2813 /* Always prefer a VF of loop->simdlen over any other VF. */
2814 if (loop->simdlen)
2815 {
2816 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818 if (new_simdlen_p != old_simdlen_p)
2819 return new_simdlen_p;
2820 }
2821
2822 const auto *old_costs = old_loop_vinfo->vector_costs;
2823 const auto *new_costs = new_loop_vinfo->vector_costs;
2824 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2826
2827 return new_costs->better_main_loop_than_p (old_costs);
2828 }
2829
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2831 true if we should. */
2832
2833 static bool
2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835 loop_vec_info old_loop_vinfo)
2836 {
2837 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838 return false;
2839
2840 if (dump_enabled_p ())
2841 dump_printf_loc (MSG_NOTE, vect_location,
2842 "***** Preferring vector mode %s to vector mode %s\n",
2843 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845 return true;
2846 }
2847
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850 MODE_I to the next mode useful to analyze.
2851 Return the loop_vinfo on success and wrapped null on failure. */
2852
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855 const vect_loop_form_info *loop_form_info,
2856 loop_vec_info main_loop_vinfo,
2857 const vector_modes &vector_modes, unsigned &mode_i,
2858 machine_mode &autodetected_vector_mode,
2859 bool &fatal)
2860 {
2861 loop_vec_info loop_vinfo
2862 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2863
2864 machine_mode vector_mode = vector_modes[mode_i];
2865 loop_vinfo->vector_mode = vector_mode;
2866 unsigned int suggested_unroll_factor = 1;
2867
2868 /* Run the main analysis. */
2869 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870 &suggested_unroll_factor);
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "***** Analysis %s with vector mode %s\n",
2874 res ? "succeeded" : " failed",
2875 GET_MODE_NAME (loop_vinfo->vector_mode));
2876
2877 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
2878 {
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE, vect_location,
2881 "***** Re-trying analysis for unrolling"
2882 " with unroll factor %d.\n",
2883 suggested_unroll_factor);
2884 loop_vec_info unroll_vinfo
2885 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886 unroll_vinfo->vector_mode = vector_mode;
2887 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889 if (new_res)
2890 {
2891 delete loop_vinfo;
2892 loop_vinfo = unroll_vinfo;
2893 }
2894 else
2895 delete unroll_vinfo;
2896 }
2897
2898 /* Remember the autodetected vector mode. */
2899 if (vector_mode == VOIDmode)
2900 autodetected_vector_mode = loop_vinfo->vector_mode;
2901
2902 /* Advance mode_i, first skipping modes that would result in the
2903 same analysis result. */
2904 while (mode_i + 1 < vector_modes.length ()
2905 && vect_chooses_same_modes_p (loop_vinfo,
2906 vector_modes[mode_i + 1]))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "***** The result for vector mode %s would"
2911 " be the same\n",
2912 GET_MODE_NAME (vector_modes[mode_i + 1]));
2913 mode_i += 1;
2914 }
2915 if (mode_i + 1 < vector_modes.length ()
2916 && VECTOR_MODE_P (autodetected_vector_mode)
2917 && (related_vector_mode (vector_modes[mode_i + 1],
2918 GET_MODE_INNER (autodetected_vector_mode))
2919 == autodetected_vector_mode)
2920 && (related_vector_mode (autodetected_vector_mode,
2921 GET_MODE_INNER (vector_modes[mode_i + 1]))
2922 == vector_modes[mode_i + 1]))
2923 {
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE, vect_location,
2926 "***** Skipping vector mode %s, which would"
2927 " repeat the analysis for %s\n",
2928 GET_MODE_NAME (vector_modes[mode_i + 1]),
2929 GET_MODE_NAME (autodetected_vector_mode));
2930 mode_i += 1;
2931 }
2932 mode_i++;
2933
2934 if (!res)
2935 {
2936 delete loop_vinfo;
2937 if (fatal)
2938 gcc_checking_assert (main_loop_vinfo == NULL);
2939 return opt_loop_vec_info::propagate_failure (res);
2940 }
2941
2942 return opt_loop_vec_info::success (loop_vinfo);
2943 }
2944
2945 /* Function vect_analyze_loop.
2946
2947 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948 for it. The different analyses will record information in the
2949 loop_vec_info struct. */
2950 opt_loop_vec_info
2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2952 {
2953 DUMP_VECT_SCOPE ("analyze_loop_nest");
2954
2955 if (loop_outer (loop)
2956 && loop_vec_info_for_loop (loop_outer (loop))
2957 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958 return opt_loop_vec_info::failure_at (vect_location,
2959 "outer-loop already vectorized.\n");
2960
2961 if (!find_loop_nest (loop, &shared->loop_nest))
2962 return opt_loop_vec_info::failure_at
2963 (vect_location,
2964 "not vectorized: loop nest containing two or more consecutive inner"
2965 " loops cannot be vectorized\n");
2966
2967 /* Analyze the loop form. */
2968 vect_loop_form_info loop_form_info;
2969 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970 if (!res)
2971 {
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "bad loop form.\n");
2975 return opt_loop_vec_info::propagate_failure (res);
2976 }
2977 if (!integer_onep (loop_form_info.assumptions))
2978 {
2979 /* We consider vectorizing this loop by versioning it under
2980 some assumptions. In order to do this, we need to clear
2981 existing information computed by scev and niter analyzer. */
2982 scev_reset_htab ();
2983 free_numbers_of_iterations_estimates (loop);
2984 /* Also set flag for this loop so that following scev and niter
2985 analysis are done under the assumptions. */
2986 loop_constraint_set (loop, LOOP_C_FINITE);
2987 }
2988
2989 auto_vector_modes vector_modes;
2990 /* Autodetect first vector size we try. */
2991 vector_modes.safe_push (VOIDmode);
2992 unsigned int autovec_flags
2993 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994 loop->simdlen != 0);
2995 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996 && !unlimited_cost_model (loop));
2997 machine_mode autodetected_vector_mode = VOIDmode;
2998 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999 unsigned int mode_i = 0;
3000 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3001
3002 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3003 a mode has not been analyzed. */
3004 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005 for (unsigned i = 0; i < vector_modes.length (); ++i)
3006 cached_vf_per_mode.safe_push (0);
3007
3008 /* First determine the main loop vectorization mode, either the first
3009 one that works, starting with auto-detecting the vector mode and then
3010 following the target's order of preference, or the one with the
3011 lowest cost if pick_lowest_cost_p. */
3012 while (1)
3013 {
3014 bool fatal;
3015 unsigned int last_mode_i = mode_i;
3016 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017 failed. */
3018 cached_vf_per_mode[last_mode_i] = -1;
3019 opt_loop_vec_info loop_vinfo
3020 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021 NULL, vector_modes, mode_i,
3022 autodetected_vector_mode, fatal);
3023 if (fatal)
3024 break;
3025
3026 if (loop_vinfo)
3027 {
3028 /* Analysis has been successful so update the VF value.  The
3029 VF should always be a multiple of unroll_factor and we want to
3030 capture the original VF here. */
3031 cached_vf_per_mode[last_mode_i]
3032 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033 loop_vinfo->suggested_unroll_factor);
3034 /* Once we hit the desired simdlen for the first time,
3035 discard any previous attempts. */
3036 if (simdlen
3037 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 {
3039 delete first_loop_vinfo;
3040 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041 simdlen = 0;
3042 }
3043 else if (pick_lowest_cost_p
3044 && first_loop_vinfo
3045 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046 {
3047 /* Pick loop_vinfo over first_loop_vinfo. */
3048 delete first_loop_vinfo;
3049 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 }
3051 if (first_loop_vinfo == NULL)
3052 first_loop_vinfo = loop_vinfo;
3053 else
3054 {
3055 delete loop_vinfo;
3056 loop_vinfo = opt_loop_vec_info::success (NULL);
3057 }
3058
3059 /* Commit to first_loop_vinfo if we have no reason to try
3060 alternatives. */
3061 if (!simdlen && !pick_lowest_cost_p)
3062 break;
3063 }
3064 if (mode_i == vector_modes.length ()
3065 || autodetected_vector_mode == VOIDmode)
3066 break;
3067
3068 /* Try the next biggest vector size. */
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "***** Re-trying analysis with vector mode %s\n",
3072 GET_MODE_NAME (vector_modes[mode_i]));
3073 }
3074 if (!first_loop_vinfo)
3075 return opt_loop_vec_info::propagate_failure (res);
3076
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "***** Choosing vector mode %s\n",
3080 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3081
3082 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083 enabled, SIMDUID is not set, it is the innermost loop and we have
3084 either already found the loop's SIMDLEN or there was no SIMDLEN to
3085 begin with.
3086 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3087 bool vect_epilogues = (!simdlen
3088 && loop->inner == NULL
3089 && param_vect_epilogues_nomask
3090 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091 && !loop->simduid);
3092 if (!vect_epilogues)
3093 return first_loop_vinfo;
3094
3095 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3096 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3097
3098 /* For epilogues start the analysis from the first mode. The motivation
3099 behind starting from the beginning comes from cases where the VECTOR_MODES
3100 array may contain length-agnostic and length-specific modes. Their
3101 ordering is not guaranteed, so we could end up picking a mode for the main
3102 loop that is after the epilogue's optimal mode. */
3103 vector_modes[0] = autodetected_vector_mode;
3104 mode_i = 0;
3105
3106 bool supports_partial_vectors =
3107 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3109
3110 while (1)
3111 {
3112 /* If the target does not support partial vectors we can shorten the
3113 number of modes to analyze for the epilogue as we know we can't pick a
3114 mode that would lead to a VF at least as big as the
3115 FIRST_VINFO_VF. */
3116 if (!supports_partial_vectors
3117 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3118 {
3119 mode_i++;
3120 if (mode_i == vector_modes.length ())
3121 break;
3122 continue;
3123 }
3124
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "***** Re-trying epilogue analysis with vector "
3128 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3129
3130 bool fatal;
3131 opt_loop_vec_info loop_vinfo
3132 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133 first_loop_vinfo,
3134 vector_modes, mode_i,
3135 autodetected_vector_mode, fatal);
3136 if (fatal)
3137 break;
3138
3139 if (loop_vinfo)
3140 {
3141 if (pick_lowest_cost_p)
3142 {
3143 /* Keep trying to roll back vectorization attempts while the
3144 loop_vec_infos they produced were worse than this one. */
3145 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146 while (!vinfos.is_empty ()
3147 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3148 {
3149 gcc_assert (vect_epilogues);
3150 delete vinfos.pop ();
3151 }
3152 }
3153 /* For now only allow one epilogue loop. */
3154 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3155 {
3156 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159 || maybe_ne (lowest_th, 0U));
3160 /* Keep track of the known smallest versioning
3161 threshold. */
3162 if (ordered_p (lowest_th, th))
3163 lowest_th = ordered_min (lowest_th, th);
3164 }
3165 else
3166 {
3167 delete loop_vinfo;
3168 loop_vinfo = opt_loop_vec_info::success (NULL);
3169 }
3170
3171 /* For now only allow one epilogue loop, but allow
3172 pick_lowest_cost_p to replace it, so commit to the
3173 first epilogue if we have no reason to try alternatives. */
3174 if (!pick_lowest_cost_p)
3175 break;
3176 }
3177
3178 if (mode_i == vector_modes.length ())
3179 break;
3180
3181 }
3182
3183 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3184 {
3185 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE, vect_location,
3188 "***** Choosing epilogue vector mode %s\n",
3189 GET_MODE_NAME
3190 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3191 }
3192
3193 return first_loop_vinfo;
3194 }
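
/* Illustrative sketch of what the epilogue-mode search above buys us
   (values and the slice pseudo-notation are made up, not taken from this
   file): with a main loop chosen at VF = 8 and an epilogue analyzed at
   VF = 4, a loop

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   is conceptually split into

     for (i = 0; i + 8 <= n; i += 8)   .. main vectorized loop, VF = 8
       a[i:i+8] = b[i:i+8] + c[i:i+8];
     for (; i + 4 <= n; i += 4)        .. vectorized epilogue, VF = 4
       a[i:i+4] = b[i:i+4] + c[i:i+4];
     for (; i < n; i++)                .. remaining scalar iterations
       a[i] = b[i] + c[i];

   which is why the mode list is re-scanned from the start to find a
   (usually smaller) mode for the epilogue.  */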
3195
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197 it in *REDUC_FN if so. */
3198
3199 static bool
3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3201 {
3202 if (code == PLUS_EXPR)
3203 {
3204 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205 return true;
3206 }
3207 return false;
3208 }
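
/* Illustrative example (assumed scalar source, not taken from this file):
   a floating-point summation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   must, without -fassociative-math, be reduced strictly left-to-right.
   IFN_FOLD_LEFT_PLUS expresses exactly that: each vector of loaded
   elements is folded into the scalar accumulator in element order
   instead of building independent partial sums.  */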
3209
3210 /* Function reduction_fn_for_scalar_code
3211
3212 Input:
3213 CODE - the code of a reduction operation.
3214
3215 Output:
3216 REDUC_FN - the corresponding internal function to be used to reduce the
3217 vector of partial results into a single scalar result, or IFN_LAST
3218 if the operation is a supported reduction operation, but does not have
3219 such an internal function.
3220
3221 Return FALSE if CODE currently cannot be vectorized as reduction. */
3222
3223 bool
3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3225 {
3226 if (code.is_tree_code ())
3227 switch (tree_code (code))
3228 {
3229 case MAX_EXPR:
3230 *reduc_fn = IFN_REDUC_MAX;
3231 return true;
3232
3233 case MIN_EXPR:
3234 *reduc_fn = IFN_REDUC_MIN;
3235 return true;
3236
3237 case PLUS_EXPR:
3238 *reduc_fn = IFN_REDUC_PLUS;
3239 return true;
3240
3241 case BIT_AND_EXPR:
3242 *reduc_fn = IFN_REDUC_AND;
3243 return true;
3244
3245 case BIT_IOR_EXPR:
3246 *reduc_fn = IFN_REDUC_IOR;
3247 return true;
3248
3249 case BIT_XOR_EXPR:
3250 *reduc_fn = IFN_REDUC_XOR;
3251 return true;
3252
3253 case MULT_EXPR:
3254 case MINUS_EXPR:
3255 *reduc_fn = IFN_LAST;
3256 return true;
3257
3258 default:
3259 return false;
3260 }
3261 else
3262 switch (combined_fn (code))
3263 {
3264 CASE_CFN_FMAX:
3265 *reduc_fn = IFN_REDUC_FMAX;
3266 return true;
3267
3268 CASE_CFN_FMIN:
3269 *reduc_fn = IFN_REDUC_FMIN;
3270 return true;
3271
3272 default:
3273 return false;
3274 }
3275 }
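
/* Illustrative example (assumed scalar source): for a max reduction

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m < a[i] ? a[i] : m;      .. recognized as MAX_EXPR

   the in-loop part operates on a vector of partial maxima and the
   epilogue uses IFN_REDUC_MAX to collapse that vector into the final
   scalar.  MULT_EXPR and MINUS_EXPR return IFN_LAST above because their
   epilogue reduction has to be open-coded with shifts/extracts instead.  */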
3276
3277 /* If there is a neutral value X such that a reduction would not be affected
3278 by the introduction of additional X elements, return that X, otherwise
3279 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3280 of the scalar elements. If the reduction has just a single initial value
3281 then INITIAL_VALUE is that value, otherwise it is null. */
3282
3283 tree
3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285 tree initial_value)
3286 {
3287 if (code.is_tree_code ())
3288 switch (tree_code (code))
3289 {
3290 case WIDEN_SUM_EXPR:
3291 case DOT_PROD_EXPR:
3292 case SAD_EXPR:
3293 case PLUS_EXPR:
3294 case MINUS_EXPR:
3295 case BIT_IOR_EXPR:
3296 case BIT_XOR_EXPR:
3297 return build_zero_cst (scalar_type);
3298
3299 case MULT_EXPR:
3300 return build_one_cst (scalar_type);
3301
3302 case BIT_AND_EXPR:
3303 return build_all_ones_cst (scalar_type);
3304
3305 case MAX_EXPR:
3306 case MIN_EXPR:
3307 return initial_value;
3308
3309 default:
3310 return NULL_TREE;
3311 }
3312 else
3313 switch (combined_fn (code))
3314 {
3315 CASE_CFN_FMIN:
3316 CASE_CFN_FMAX:
3317 return initial_value;
3318
3319 default:
3320 return NULL_TREE;
3321 }
3322 }
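
/* For instance (identities follow directly from the cases above): a PLUS
   or BIT_XOR reduction can be padded with 0, a MULT reduction with 1 and
   a BIT_AND reduction with an all-ones value without changing the result:

     s = s + 0;       .. PLUS_EXPR neutral element
     s = s * 1;       .. MULT_EXPR neutral element
     s = s & ~0;      .. BIT_AND_EXPR neutral element

   MIN/MAX have no universal neutral element, so the single initial value
   itself is used to fill the extra lanes when one is available.  */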
3323
3324 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3325 STMT is printed with a message MSG. */
3326
3327 static void
3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3329 {
3330 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3331 }
3332
3333 /* Return true if we need an in-order reduction for operation CODE
3334 on type TYPE, i.e. if it is not safe to reassociate the operation
3335 across iterations of the reduction. */
3336
3337 bool
3338 needs_fold_left_reduction_p (tree type, code_helper code)
3339 {
3340 /* CHECKME: check for !flag_finite_math_only too? */
3341 if (SCALAR_FLOAT_TYPE_P (type))
3342 {
3343 if (code.is_tree_code ())
3344 switch (tree_code (code))
3345 {
3346 case MIN_EXPR:
3347 case MAX_EXPR:
3348 return false;
3349
3350 default:
3351 return !flag_associative_math;
3352 }
3353 else
3354 switch (combined_fn (code))
3355 {
3356 CASE_CFN_FMIN:
3357 CASE_CFN_FMAX:
3358 return false;
3359
3360 default:
3361 return !flag_associative_math;
3362 }
3363 }
3364
3365 if (INTEGRAL_TYPE_P (type))
3366 return (!code.is_tree_code ()
3367 || !operation_no_trapping_overflow (type, tree_code (code)));
3368
3369 if (SAT_FIXED_POINT_TYPE_P (type))
3370 return true;
3371
3372 return false;
3373 }
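
/* Example of the integral case above (illustrative): with -ftrapv a
   signed accumulation like

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];      .. may trap on signed overflow

   cannot be reassociated into partial sums, because a reordered partial
   sum might overflow even though the original left-to-right sum does not,
   so an in-order (fold-left) reduction is required.  */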
3374
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3376 has a handled computation expression. Store the main reduction
3377 operation in *CODE. */
3378
3379 static bool
3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381 tree loop_arg, code_helper *code,
3382 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3383 {
3384 auto_bitmap visited;
3385 tree lookfor = PHI_RESULT (phi);
3386 ssa_op_iter curri;
3387 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388 while (USE_FROM_PTR (curr) != loop_arg)
3389 curr = op_iter_next_use (&curri);
3390 curri.i = curri.numops;
3391 do
3392 {
3393 path.safe_push (std::make_pair (curri, curr));
3394 tree use = USE_FROM_PTR (curr);
3395 if (use == lookfor)
3396 break;
3397 gimple *def = SSA_NAME_DEF_STMT (use);
3398 if (gimple_nop_p (def)
3399 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3400 {
3401 pop:
3402 do
3403 {
3404 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405 curri = x.first;
3406 curr = x.second;
3407 do
3408 curr = op_iter_next_use (&curri);
3409 /* Skip already visited or non-SSA operands (from iterating
3410 over PHI args). */
3411 while (curr != NULL_USE_OPERAND_P
3412 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413 || ! bitmap_set_bit (visited,
3414 SSA_NAME_VERSION
3415 (USE_FROM_PTR (curr)))));
3416 }
3417 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418 if (curr == NULL_USE_OPERAND_P)
3419 break;
3420 }
3421 else
3422 {
3423 if (gimple_code (def) == GIMPLE_PHI)
3424 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425 else
3426 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427 while (curr != NULL_USE_OPERAND_P
3428 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429 || ! bitmap_set_bit (visited,
3430 SSA_NAME_VERSION
3431 (USE_FROM_PTR (curr)))))
3432 curr = op_iter_next_use (&curri);
3433 if (curr == NULL_USE_OPERAND_P)
3434 goto pop;
3435 }
3436 }
3437 while (1);
3438 if (dump_file && (dump_flags & TDF_DETAILS))
3439 {
3440 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441 unsigned i;
3442 std::pair<ssa_op_iter, use_operand_p> *x;
3443 FOR_EACH_VEC_ELT (path, i, x)
3444 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445 dump_printf (MSG_NOTE, "\n");
3446 }
3447
3448 /* Check whether the reduction path detected is valid. */
3449 bool fail = path.length () == 0;
3450 bool neg = false;
3451 int sign = -1;
3452 *code = ERROR_MARK;
3453 for (unsigned i = 1; i < path.length (); ++i)
3454 {
3455 gimple *use_stmt = USE_STMT (path[i].second);
3456 gimple_match_op op;
3457 if (!gimple_extract_op (use_stmt, &op))
3458 {
3459 fail = true;
3460 break;
3461 }
3462 unsigned int opi = op.num_ops;
3463 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3464 {
3465 /* The following makes sure we can compute the operand index
3466 easily; it also mostly disallows chaining via COND_EXPR condition
3467 operands. */
3468 for (opi = 0; opi < op.num_ops; ++opi)
3469 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470 break;
3471 }
3472 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3473 {
3474 for (opi = 0; opi < op.num_ops; ++opi)
3475 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476 break;
3477 }
3478 if (opi == op.num_ops)
3479 {
3480 fail = true;
3481 break;
3482 }
3483 op.code = canonicalize_code (op.code, op.type);
3484 if (op.code == MINUS_EXPR)
3485 {
3486 op.code = PLUS_EXPR;
3487 /* Track whether we negate the reduction value each iteration. */
3488 if (op.ops[1] == op.ops[opi])
3489 neg = ! neg;
3490 }
3491 if (CONVERT_EXPR_CODE_P (op.code)
3492 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3493 ;
3494 else if (*code == ERROR_MARK)
3495 {
3496 *code = op.code;
3497 sign = TYPE_SIGN (op.type);
3498 }
3499 else if (op.code != *code)
3500 {
3501 fail = true;
3502 break;
3503 }
3504 else if ((op.code == MIN_EXPR
3505 || op.code == MAX_EXPR)
3506 && sign != TYPE_SIGN (op.type))
3507 {
3508 fail = true;
3509 break;
3510 }
3511 /* Check there's only a single stmt the op is used on. For the
3512 non-value-changing tail and the last stmt, allow out-of-loop uses.
3513 ??? We could relax this and handle arbitrary live stmts by
3514 forcing a scalar epilogue for example. */
3515 imm_use_iterator imm_iter;
3516 use_operand_p use_p;
3517 gimple *op_use_stmt;
3518 unsigned cnt = 0;
3519 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3520 if (!is_gimple_debug (op_use_stmt)
3521 && (*code != ERROR_MARK
3522 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3523 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3524 cnt++;
3525 if (cnt != 1)
3526 {
3527 fail = true;
3528 break;
3529 }
3530 }
3531 return ! fail && ! neg && *code != ERROR_MARK;
3532 }
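
/* Illustrative example of a path accepted above (made-up SSA names):

     # s_1 = PHI <s_0(preheader), s_4(latch)>
     t_2 = _a + s_1;        .. PLUS_EXPR, uses the PHI result
     s_4 = t_2 + _b;        .. PLUS_EXPR again, same code

   walking back from the latch value s_4 to the PHI result s_1 visits
   only PLUS_EXPR statements, each intermediate value having a single
   in-loop use, so the path is valid and *CODE is set to PLUS_EXPR.
   Mixing codes (say a PLUS followed by a MAX) or giving an intermediate
   value a second use makes the check fail.  */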
3533
3534 bool
3535 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3536 tree loop_arg, enum tree_code code)
3537 {
3538 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3539 code_helper code_;
3540 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3541 && code_ == code);
3542 }
3543
3544
3545
3546 /* Function vect_is_simple_reduction
3547
3548 (1) Detect a cross-iteration def-use cycle that represents a simple
3549 reduction computation. We look for the following pattern:
3550
3551 loop_header:
3552 a1 = phi < a0, a2 >
3553 a3 = ...
3554 a2 = operation (a3, a1)
3555
3556 or
3557
3558 a3 = ...
3559 loop_header:
3560 a1 = phi < a0, a2 >
3561 a2 = operation (a3, a1)
3562
3563 such that:
3564 1. operation is commutative and associative and it is safe to
3565 change the order of the computation
3566 2. no uses for a2 in the loop (a2 is used out of the loop)
3567 3. no uses of a1 in the loop besides the reduction operation
3568 4. no uses of a1 outside the loop.
3569
3570 Conditions 1,4 are tested here.
3571 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3572
3573 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3574 nested cycles.
3575
3576 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3577 reductions:
3578
3579 a1 = phi < a0, a2 >
3580 inner loop (def of a3)
3581 a2 = phi < a3 >
3582
3583 (4) Detect condition expressions, i.e.:
3584 for (int i = 0; i < N; i++)
3585 if (a[i] < val)
3586 ret_val = a[i];
3587
3588 */
3589
3590 static stmt_vec_info
3591 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3592 bool *double_reduc, bool *reduc_chain_p)
3593 {
3594 gphi *phi = as_a <gphi *> (phi_info->stmt);
3595 gimple *phi_use_stmt = NULL;
3596 imm_use_iterator imm_iter;
3597 use_operand_p use_p;
3598
3599 *double_reduc = false;
3600 *reduc_chain_p = false;
3601 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3602
3603 tree phi_name = PHI_RESULT (phi);
3604 /* ??? If there are no uses of the PHI result the inner loop reduction
3605 won't be detected as possibly double-reduction by vectorizable_reduction
3606 because that tries to walk the PHI arg from the preheader edge which
3607 can be constant. See PR60382. */
3608 if (has_zero_uses (phi_name))
3609 return NULL;
3610 class loop *loop = (gimple_bb (phi))->loop_father;
3611 unsigned nphi_def_loop_uses = 0;
3612 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3613 {
3614 gimple *use_stmt = USE_STMT (use_p);
3615 if (is_gimple_debug (use_stmt))
3616 continue;
3617
3618 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3619 {
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "intermediate value used outside loop.\n");
3623
3624 return NULL;
3625 }
3626
3627 nphi_def_loop_uses++;
3628 phi_use_stmt = use_stmt;
3629 }
3630
3631 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3632 if (TREE_CODE (latch_def) != SSA_NAME)
3633 {
3634 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636 "reduction: not ssa_name: %T\n", latch_def);
3637 return NULL;
3638 }
3639
3640 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3641 if (!def_stmt_info
3642 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3643 return NULL;
3644
3645 bool nested_in_vect_loop
3646 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3647 unsigned nlatch_def_loop_uses = 0;
3648 auto_vec<gphi *, 3> lcphis;
3649 bool inner_loop_of_double_reduc = false;
3650 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3651 {
3652 gimple *use_stmt = USE_STMT (use_p);
3653 if (is_gimple_debug (use_stmt))
3654 continue;
3655 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3656 nlatch_def_loop_uses++;
3657 else
3658 {
3659 /* We can have more than one loop-closed PHI. */
3660 lcphis.safe_push (as_a <gphi *> (use_stmt));
3661 if (nested_in_vect_loop
3662 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3663 == vect_double_reduction_def))
3664 inner_loop_of_double_reduc = true;
3665 }
3666 }
3667
3668 /* If we are vectorizing an inner reduction, we execute it in the
3669 original order only when we are not dealing with a double
3670 reduction. */
3671 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3672 {
3673 if (dump_enabled_p ())
3674 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3675 "detected nested cycle: ");
3676 return def_stmt_info;
3677 }
3678
3679 /* When the inner loop of a double reduction ends up with more than
3680 one loop-closed PHI we have failed to classify the alternate PHIs
3681 as double reductions, which would lead to wrong code. See PR103237. */
3682 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3683 {
3684 if (dump_enabled_p ())
3685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3686 "unhandled double reduction\n");
3687 return NULL;
3688 }
3689
3690 /* If this isn't a nested cycle or if the nested cycle reduction value
3691 is used outside of the inner loop we cannot handle uses of the reduction
3692 value. */
3693 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3694 {
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "reduction used in loop.\n");
3698 return NULL;
3699 }
3700
3701 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3702 defined in the inner loop. */
3703 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3704 {
3705 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3706 if (gimple_phi_num_args (def_stmt) != 1
3707 || TREE_CODE (op1) != SSA_NAME)
3708 {
3709 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711 "unsupported phi node definition.\n");
3712
3713 return NULL;
3714 }
3715
3716 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3717 if (gimple_bb (def1)
3718 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3719 && loop->inner
3720 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3721 && (is_gimple_assign (def1) || is_gimple_call (def1))
3722 && is_a <gphi *> (phi_use_stmt)
3723 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3724 {
3725 if (dump_enabled_p ())
3726 report_vect_op (MSG_NOTE, def_stmt,
3727 "detected double reduction: ");
3728
3729 *double_reduc = true;
3730 return def_stmt_info;
3731 }
3732
3733 return NULL;
3734 }
3735
3736 /* Look for the expression computing latch_def from the loop PHI result. */
3737 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3738 code_helper code;
3739 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3740 path))
3741 {
3742 STMT_VINFO_REDUC_CODE (phi_info) = code;
3743 if (code == COND_EXPR && !nested_in_vect_loop)
3744 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3745
3746 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3747 reduction chain for which the additional restriction is that
3748 all operations in the chain are the same. */
3749 auto_vec<stmt_vec_info, 8> reduc_chain;
3750 unsigned i;
3751 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3752 for (i = path.length () - 1; i >= 1; --i)
3753 {
3754 gimple *stmt = USE_STMT (path[i].second);
3755 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3756 gimple_match_op op;
3757 if (!gimple_extract_op (stmt, &op))
3758 gcc_unreachable ();
3759 if (gassign *assign = dyn_cast<gassign *> (stmt))
3760 STMT_VINFO_REDUC_IDX (stmt_info)
3761 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3762 else
3763 {
3764 gcall *call = as_a<gcall *> (stmt);
3765 STMT_VINFO_REDUC_IDX (stmt_info)
3766 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3767 }
3768 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3769 && (i == 1 || i == path.length () - 1));
3770 if ((op.code != code && !leading_conversion)
3771 /* We can only handle the final value in epilogue
3772 generation for reduction chains. */
3773 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3774 is_slp_reduc = false;
3775 /* For reduction chains we support trailing/leading
3776 conversions. We do not store those in the actual chain. */
3777 if (leading_conversion)
3778 continue;
3779 reduc_chain.safe_push (stmt_info);
3780 }
3781 if (is_slp_reduc && reduc_chain.length () > 1)
3782 {
3783 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3784 {
3785 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3786 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3787 }
3788 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3789 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3790
3791 /* Save the chain for further analysis in SLP detection. */
3792 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3793 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3794
3795 *reduc_chain_p = true;
3796 if (dump_enabled_p ())
3797 dump_printf_loc (MSG_NOTE, vect_location,
3798 "reduction: detected reduction chain\n");
3799 }
3800 else if (dump_enabled_p ())
3801 dump_printf_loc (MSG_NOTE, vect_location,
3802 "reduction: detected reduction\n");
3803
3804 return def_stmt_info;
3805 }
3806
3807 if (dump_enabled_p ())
3808 dump_printf_loc (MSG_NOTE, vect_location,
3809 "reduction: unknown pattern\n");
3810
3811 return NULL;
3812 }
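
/* Illustrative example of a reduction chain recorded above (assumed
   scalar source):

     for (i = 0; i < n; i++)
       {
         s += a[i];     .. first element of the chain
         s += b[i];     .. next element, same operation
       }

   Both statements use PLUS_EXPR and each intermediate value has a single
   use, so they are linked via REDUC_GROUP_FIRST/NEXT_ELEMENT and pushed
   onto LOOP_VINFO_REDUCTION_CHAINS for later SLP analysis.  */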
3813
3814 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3815 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3816 or -1 if not known. */
3817
3818 static int
3819 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3820 {
3821 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3822 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3823 {
3824 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_NOTE, vect_location,
3826 "cost model: epilogue peel iters set to vf/2 "
3827 "because loop iterations are unknown.\n");
3828 return assumed_vf / 2;
3829 }
3830 else
3831 {
3832 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3833 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3834 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3835 /* If we need to peel for gaps but the computed epilogue peel count
3836 is zero, we still have to peel VF iterations. */
3837 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3838 peel_iters_epilogue = assumed_vf;
3839 return peel_iters_epilogue;
3840 }
3841 }
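
/* Worked example (numbers are made up): with NITERS = 100, an assumed
   VF of 8 and PEEL_ITERS_PROLOGUE = 3, the epilogue gets
   (100 - 3) % 8 = 1 iteration.  If the remaining count were instead an
   exact multiple of 8 and PEELING_FOR_GAPS were set, the result would be
   bumped to a full VF of 8 so that the last iterations run in the scalar
   epilogue rather than in a vector iteration that could over-read.  */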
3842
3843 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3844 int
3845 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3846 int *peel_iters_epilogue,
3847 stmt_vector_for_cost *scalar_cost_vec,
3848 stmt_vector_for_cost *prologue_cost_vec,
3849 stmt_vector_for_cost *epilogue_cost_vec)
3850 {
3851 int retval = 0;
3852
3853 *peel_iters_epilogue
3854 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3855
3856 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3857 {
3858 /* If peeled iterations are known but the number of scalar loop
3859 iterations is unknown, count a taken branch per peeled loop. */
3860 if (peel_iters_prologue > 0)
3861 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3862 vect_prologue);
3863 if (*peel_iters_epilogue > 0)
3864 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3865 vect_epilogue);
3866 }
3867
3868 stmt_info_for_cost *si;
3869 int j;
3870 if (peel_iters_prologue)
3871 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3872 retval += record_stmt_cost (prologue_cost_vec,
3873 si->count * peel_iters_prologue,
3874 si->kind, si->stmt_info, si->misalign,
3875 vect_prologue);
3876 if (*peel_iters_epilogue)
3877 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3878 retval += record_stmt_cost (epilogue_cost_vec,
3879 si->count * *peel_iters_epilogue,
3880 si->kind, si->stmt_info, si->misalign,
3881 vect_epilogue);
3882
3883 return retval;
3884 }
3885
3886 /* Function vect_estimate_min_profitable_iters
3887
3888 Return the number of iterations required for the vector version of the
3889 loop to be profitable relative to the cost of the scalar version of the
3890 loop.
3891
3892 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3893 of iterations for vectorization. -1 value means loop vectorization
3894 is not profitable. This returned value may be used for dynamic
3895 profitability check.
3896
3897 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3898 for static check against estimated number of iterations. */
3899
3900 static void
3901 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3902 int *ret_min_profitable_niters,
3903 int *ret_min_profitable_estimate,
3904 unsigned *suggested_unroll_factor)
3905 {
3906 int min_profitable_iters;
3907 int min_profitable_estimate;
3908 int peel_iters_prologue;
3909 int peel_iters_epilogue;
3910 unsigned vec_inside_cost = 0;
3911 int vec_outside_cost = 0;
3912 unsigned vec_prologue_cost = 0;
3913 unsigned vec_epilogue_cost = 0;
3914 int scalar_single_iter_cost = 0;
3915 int scalar_outside_cost = 0;
3916 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3917 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3918 vector_costs *target_cost_data = loop_vinfo->vector_costs;
3919
3920 /* Cost model disabled. */
3921 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3922 {
3923 if (dump_enabled_p ())
3924 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3925 *ret_min_profitable_niters = 0;
3926 *ret_min_profitable_estimate = 0;
3927 return;
3928 }
3929
3930 /* Requires loop versioning tests to handle misalignment. */
3931 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3932 {
3933 /* FIXME: Make cost depend on complexity of individual check. */
3934 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3935 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3936 if (dump_enabled_p ())
3937 dump_printf (MSG_NOTE,
3938 "cost model: Adding cost of checks for loop "
3939 "versioning to treat misalignment.\n");
3940 }
3941
3942 /* Requires loop versioning with alias checks. */
3943 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3944 {
3945 /* FIXME: Make cost depend on complexity of individual check. */
3946 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3947 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3948 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3949 if (len)
3950 /* Count LEN - 1 ANDs and LEN comparisons. */
3951 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3952 scalar_stmt, vect_prologue);
3953 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3954 if (len)
3955 {
3956 /* Count LEN - 1 ANDs and LEN comparisons. */
3957 unsigned int nstmts = len * 2 - 1;
3958 /* +1 for each bias that needs adding. */
3959 for (unsigned int i = 0; i < len; ++i)
3960 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3961 nstmts += 1;
3962 (void) add_stmt_cost (target_cost_data, nstmts,
3963 scalar_stmt, vect_prologue);
3964 }
3965 if (dump_enabled_p ())
3966 dump_printf (MSG_NOTE,
3967 "cost model: Adding cost of checks for loop "
3968 "versioning aliasing.\n");
3969 }
3970
3971 /* Requires loop versioning with niter checks. */
3972 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3973 {
3974 /* FIXME: Make cost depend on complexity of individual check. */
3975 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3976 NULL, NULL, NULL_TREE, 0, vect_prologue);
3977 if (dump_enabled_p ())
3978 dump_printf (MSG_NOTE,
3979 "cost model: Adding cost of checks for loop "
3980 "versioning niters.\n");
3981 }
3982
3983 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3985 vect_prologue);
3986
3987 /* Count statements in scalar loop. Using this as scalar cost for a single
3988 iteration for now.
3989
3990 TODO: Add outer loop support.
3991
3992 TODO: Consider assigning different costs to different scalar
3993 statements. */
3994
3995 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3996
3997 /* Add additional cost for the peeled instructions in prologue and epilogue
3998 loop. (For fully-masked loops there will be no peeling.)
3999
4000 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4001 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4002
4003 TODO: Build an expression that represents peel_iters for prologue and
4004 epilogue to be used in a run-time test. */
4005
4006 bool prologue_need_br_taken_cost = false;
4007 bool prologue_need_br_not_taken_cost = false;
4008
4009 /* Calculate peel_iters_prologue. */
4010 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4011 peel_iters_prologue = 0;
4012 else if (npeel < 0)
4013 {
4014 peel_iters_prologue = assumed_vf / 2;
4015 if (dump_enabled_p ())
4016 dump_printf (MSG_NOTE, "cost model: "
4017 "prologue peel iters set to vf/2.\n");
4018
4019 /* If peeled iterations are unknown, count a taken branch and a not taken
4020 branch per peeled loop. Even if scalar loop iterations are known,
4021 vector iterations are not known since peeled prologue iterations are
4022 not known. Hence guards remain the same. */
4023 prologue_need_br_taken_cost = true;
4024 prologue_need_br_not_taken_cost = true;
4025 }
4026 else
4027 {
4028 peel_iters_prologue = npeel;
4029 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4030 /* If peeled iterations are known but the number of scalar loop
4031 iterations is unknown, count a taken branch per peeled loop. */
4032 prologue_need_br_taken_cost = true;
4033 }
4034
4035 bool epilogue_need_br_taken_cost = false;
4036 bool epilogue_need_br_not_taken_cost = false;
4037
4038 /* Calculate peel_iters_epilogue. */
4039 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4040 /* We need to peel exactly one iteration for gaps. */
4041 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4042 else if (npeel < 0)
4043 {
4044 /* If peeling for alignment is unknown, the loop bound of the main
4045 loop becomes unknown. */
4046 peel_iters_epilogue = assumed_vf / 2;
4047 if (dump_enabled_p ())
4048 dump_printf (MSG_NOTE, "cost model: "
4049 "epilogue peel iters set to vf/2 because "
4050 "peeling for alignment is unknown.\n");
4051
4052 /* See the same reason above in peel_iters_prologue calculation. */
4053 epilogue_need_br_taken_cost = true;
4054 epilogue_need_br_not_taken_cost = true;
4055 }
4056 else
4057 {
4058 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4059 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4060 /* If peeled iterations are known but the number of scalar loop
4061 iterations is unknown, count a taken branch per peeled loop. */
4062 epilogue_need_br_taken_cost = true;
4063 }
4064
4065 stmt_info_for_cost *si;
4066 int j;
4067 /* Add costs associated with peel_iters_prologue. */
4068 if (peel_iters_prologue)
4069 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4070 {
4071 (void) add_stmt_cost (target_cost_data,
4072 si->count * peel_iters_prologue, si->kind,
4073 si->stmt_info, si->node, si->vectype,
4074 si->misalign, vect_prologue);
4075 }
4076
4077 /* Add costs associated with peel_iters_epilogue. */
4078 if (peel_iters_epilogue)
4079 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4080 {
4081 (void) add_stmt_cost (target_cost_data,
4082 si->count * peel_iters_epilogue, si->kind,
4083 si->stmt_info, si->node, si->vectype,
4084 si->misalign, vect_epilogue);
4085 }
4086
4087 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4088
4089 if (prologue_need_br_taken_cost)
4090 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4091 vect_prologue);
4092
4093 if (prologue_need_br_not_taken_cost)
4094 (void) add_stmt_cost (target_cost_data, 1,
4095 cond_branch_not_taken, vect_prologue);
4096
4097 if (epilogue_need_br_taken_cost)
4098 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4099 vect_epilogue);
4100
4101 if (epilogue_need_br_not_taken_cost)
4102 (void) add_stmt_cost (target_cost_data, 1,
4103 cond_branch_not_taken, vect_epilogue);
4104
4105 /* Take care of special costs for rgroup controls of partial vectors. */
4106 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4107 {
4108 /* Calculate how many masks we need to generate. */
4109 unsigned int num_masks = 0;
4110 rgroup_controls *rgm;
4111 unsigned int num_vectors_m1;
4112 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4113 if (rgm->type)
4114 num_masks += num_vectors_m1 + 1;
4115 gcc_assert (num_masks > 0);
4116
4117 /* In the worst case, we need to generate each mask in the prologue
4118 and in the loop body. One of the loop body mask instructions
4119 replaces the comparison in the scalar loop, and since we don't
4120 count the scalar comparison against the scalar body, we shouldn't
4121 count that vector instruction against the vector body either.
4122
4123 Sometimes we can use unpacks instead of generating prologue
4124 masks and sometimes the prologue mask will fold to a constant,
4125 so the actual prologue cost might be smaller. However, it's
4126 simpler and safer to use the worst-case cost; if this ends up
4127 being the tie-breaker between vectorizing or not, then it's
4128 probably better not to vectorize. */
4129 (void) add_stmt_cost (target_cost_data, num_masks,
4130 vector_stmt, NULL, NULL, NULL_TREE, 0,
4131 vect_prologue);
4132 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4133 vector_stmt, NULL, NULL, NULL_TREE, 0,
4134 vect_body);
4135 }
4136 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4137 {
4138 /* Referring to the functions vect_set_loop_condition_partial_vectors
4139 and vect_set_loop_controls_directly, we need to generate each
4140 length in the prologue and in the loop body if required. Although
4141 there are some possible optimizations, we consider the worst case
4142 here. */
4143
4144 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4145 signed char partial_load_store_bias
4146 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4147 bool need_iterate_p
4148 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4149 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4150
4151 /* Calculate how many statements to be added. */
4152 unsigned int prologue_stmts = 0;
4153 unsigned int body_stmts = 0;
4154
4155 rgroup_controls *rgc;
4156 unsigned int num_vectors_m1;
4157 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4158 if (rgc->type)
4159 {
4160 /* May need one SHIFT for nitems_total computation. */
4161 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4162 if (nitems != 1 && !niters_known_p)
4163 prologue_stmts += 1;
4164
4165 /* May need one MAX and one MINUS for wrap around. */
4166 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4167 prologue_stmts += 2;
4168
4169 /* Need one MAX and one MINUS for each batch limit except for
4170 the first one. */
4171 prologue_stmts += num_vectors_m1 * 2;
4172
4173 unsigned int num_vectors = num_vectors_m1 + 1;
4174
4175 /* Need to set up lengths in prologue, only one MIN required
4176 for each since start index is zero. */
4177 prologue_stmts += num_vectors;
4178
4179 /* If we have a non-zero partial load bias, we need one PLUS
4180 to adjust the load length. */
4181 if (partial_load_store_bias != 0)
4182 body_stmts += 1;
4183
4184 /* Each may need two MINs and one MINUS to update lengths in body
4185 for next iteration. */
4186 if (need_iterate_p)
4187 body_stmts += 3 * num_vectors;
4188 }
4189
4190 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4191 scalar_stmt, vect_prologue);
4192 (void) add_stmt_cost (target_cost_data, body_stmts,
4193 scalar_stmt, vect_body);
4194 }
4195
4196 /* FORNOW: The scalar outside cost is incremented in one of the
4197 following ways:
4198
4199 1. The vectorizer checks for alignment and aliasing and generates
4200 a condition that allows dynamic vectorization. A cost model
4201 check is ANDED with the versioning condition. Hence scalar code
4202 path now has the added cost of the versioning check.
4203
4204 if (cost > th & versioning_check)
4205 jmp to vector code
4206
4207 Hence run-time scalar is incremented by not-taken branch cost.
4208
4209 2. The vectorizer then checks if a prologue is required. If the
4210 cost model check was not done before during versioning, it has to
4211 be done before the prologue check.
4212
4213 if (cost <= th)
4214 prologue = scalar_iters
4215 if (prologue == 0)
4216 jmp to vector code
4217 else
4218 execute prologue
4219 if (prologue == num_iters)
4220 go to exit
4221
4222 Hence the run-time scalar cost is incremented by a taken branch,
4223 plus a not-taken branch, plus a taken branch cost.
4224
4225 3. The vectorizer then checks if an epilogue is required. If the
4226 cost model check was not done before during prologue check, it
4227 has to be done with the epilogue check.
4228
4229 if (prologue == 0)
4230 jmp to vector code
4231 else
4232 execute prologue
4233 if (prologue == num_iters)
4234 go to exit
4235 vector code:
4236 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4237 jmp to epilogue
4238
4239 Hence the run-time scalar cost should be incremented by 2 taken
4240 branches.
4241
4242 TODO: The back end may reorder the BBs differently and reverse
4243 conditions/branch directions. Change the estimates below to
4244 something more reasonable. */
4245
4246 /* If the number of iterations is known and we do not do versioning, we can
4247 decide whether to vectorize at compile time. Hence the scalar version
4248 does not carry cost model guard costs. */
4249 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4250 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4251 {
4252 /* Cost model check occurs at versioning. */
4253 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4254 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4255 else
4256 {
4257 /* Cost model check occurs at prologue generation. */
4258 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4259 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4260 + vect_get_stmt_cost (cond_branch_not_taken);
4261 /* Cost model check occurs at epilogue generation. */
4262 else
4263 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4264 }
4265 }
4266
4267 /* Complete the target-specific cost calculations. */
4268 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4269 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4270 suggested_unroll_factor);
4271
4272 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4273 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4274 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4275 *suggested_unroll_factor,
4276 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4277 {
4278 if (dump_enabled_p ())
4279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4280 "can't unroll as unrolled vectorization factor larger"
4281 " than maximum vectorization factor: "
4282 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4283 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4284 *suggested_unroll_factor = 1;
4285 }
4286
4287 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4288
4289 if (dump_enabled_p ())
4290 {
4291 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4292 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4293 vec_inside_cost);
4294 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4295 vec_prologue_cost);
4296 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4297 vec_epilogue_cost);
4298 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4299 scalar_single_iter_cost);
4300 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4301 scalar_outside_cost);
4302 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4303 vec_outside_cost);
4304 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4305 peel_iters_prologue);
4306 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4307 peel_iters_epilogue);
4308 }
4309
4310 /* Calculate number of iterations required to make the vector version
4311 profitable, relative to the loop bodies only. The following condition
4312 must hold true:
4313 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4314 where
4315 SIC = scalar iteration cost, VIC = vector iteration cost,
4316 VOC = vector outside cost, VF = vectorization factor,
4317 NPEEL = prologue iterations + epilogue iterations,
4318 SOC = scalar outside cost for run time cost model check. */
4319
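/* Worked example of the condition above (all numbers invented): with
   SIC = 4, VIC = 10, VF = 4, NPEEL = 2 and SOC = 0, one vector iteration
   saves SIC * VF - VIC = 4 * 4 - 10 = 6 units, so vectorization pays off
   once the outside overhead VOC - SIC * NPEEL - SOC has been amortized
   over enough vector iterations; that is what the code below computes.  */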
4320 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4321 - vec_inside_cost);
4322 if (saving_per_viter <= 0)
4323 {
4324 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4325 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4326 "vectorization did not happen for a simd loop");
4327
4328 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330 "cost model: the vector iteration cost = %d "
4331 "divided by the scalar iteration cost = %d "
4332 "is greater or equal to the vectorization factor = %d"
4333 ".\n",
4334 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4335 *ret_min_profitable_niters = -1;
4336 *ret_min_profitable_estimate = -1;
4337 return;
4338 }
4339
4340 /* ??? The "if" arm is written to handle all cases; see below for what
4341 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4342 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4343 {
4344 /* Rewriting the condition above in terms of the number of
4345 vector iterations (vniters) rather than the number of
4346 scalar iterations (niters) gives:
4347
4348 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4349
4350 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4351
4352 For integer N, X and Y when X > 0:
4353
4354 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4355 int outside_overhead = (vec_outside_cost
4356 - scalar_single_iter_cost * peel_iters_prologue
4357 - scalar_single_iter_cost * peel_iters_epilogue
4358 - scalar_outside_cost);
4359 /* We're only interested in cases that require at least one
4360 vector iteration. */
4361 int min_vec_niters = 1;
4362 if (outside_overhead > 0)
4363 min_vec_niters = outside_overhead / saving_per_viter + 1;
4364
4365 if (dump_enabled_p ())
4366 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4367 min_vec_niters);
4368
4369 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4370 {
4371 /* Now that we know the minimum number of vector iterations,
4372 find the minimum niters for which the scalar cost is larger:
4373
4374 SIC * niters > VIC * vniters + VOC - SOC
4375
4376 We know that the minimum niters is no more than
4377 vniters * VF + NPEEL, but it might be (and often is) less
4378 than that if a partial vector iteration is cheaper than the
4379 equivalent scalar code. */
4380 int threshold = (vec_inside_cost * min_vec_niters
4381 + vec_outside_cost
4382 - scalar_outside_cost);
4383 if (threshold <= 0)
4384 min_profitable_iters = 1;
4385 else
4386 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4387 }
4388 else
4389 /* Convert the number of vector iterations into a number of
4390 scalar iterations. */
4391 min_profitable_iters = (min_vec_niters * assumed_vf
4392 + peel_iters_prologue
4393 + peel_iters_epilogue);
4394 }
4395 else
4396 {
4397 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4398 * assumed_vf
4399 - vec_inside_cost * peel_iters_prologue
4400 - vec_inside_cost * peel_iters_epilogue);
4401 if (min_profitable_iters <= 0)
4402 min_profitable_iters = 0;
4403 else
4404 {
4405 min_profitable_iters /= saving_per_viter;
4406
4407 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4408 <= (((int) vec_inside_cost * min_profitable_iters)
4409 + (((int) vec_outside_cost - scalar_outside_cost)
4410 * assumed_vf)))
4411 min_profitable_iters++;
4412 }
4413 }
4414
4415 if (dump_enabled_p ())
4416 dump_printf (MSG_NOTE,
4417 " Calculated minimum iters for profitability: %d\n",
4418 min_profitable_iters);
4419
4420 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4421 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4422 /* We want the vectorized loop to execute at least once. */
4423 min_profitable_iters = assumed_vf + peel_iters_prologue;
4424 else if (min_profitable_iters < peel_iters_prologue)
4425 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4426 vectorized loop executes at least once. */
4427 min_profitable_iters = peel_iters_prologue;
4428
4429 if (dump_enabled_p ())
4430 dump_printf_loc (MSG_NOTE, vect_location,
4431 " Runtime profitability threshold = %d\n",
4432 min_profitable_iters);
4433
4434 *ret_min_profitable_niters = min_profitable_iters;
4435
4436 /* Calculate number of iterations required to make the vector version
4437 profitable, relative to the loop bodies only.
4438
4439 Non-vectorized variant is SIC * niters and it must win over vector
4440 variant on the expected loop trip count. The following condition must hold true:
4441 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4442
4443 if (vec_outside_cost <= 0)
4444 min_profitable_estimate = 0;
4445 /* ??? This "else if" arm is written to handle all cases; see below for
4446 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4447 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4448 {
4449 /* This is a repeat of the code above, but with + SOC rather
4450 than - SOC. */
4451 int outside_overhead = (vec_outside_cost
4452 - scalar_single_iter_cost * peel_iters_prologue
4453 - scalar_single_iter_cost * peel_iters_epilogue
4454 + scalar_outside_cost);
4455 int min_vec_niters = 1;
4456 if (outside_overhead > 0)
4457 min_vec_niters = outside_overhead / saving_per_viter + 1;
4458
4459 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4460 {
4461 int threshold = (vec_inside_cost * min_vec_niters
4462 + vec_outside_cost
4463 + scalar_outside_cost);
4464 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4465 }
4466 else
4467 min_profitable_estimate = (min_vec_niters * assumed_vf
4468 + peel_iters_prologue
4469 + peel_iters_epilogue);
4470 }
4471 else
4472 {
4473 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4474 * assumed_vf
4475 - vec_inside_cost * peel_iters_prologue
4476 - vec_inside_cost * peel_iters_epilogue)
4477 / ((scalar_single_iter_cost * assumed_vf)
4478 - vec_inside_cost);
4479 }
4480 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_NOTE, vect_location,
4483 " Static estimate profitability threshold = %d\n",
4484 min_profitable_estimate);
4485
4486 *ret_min_profitable_estimate = min_profitable_estimate;
4487 }
4488
4489 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4490 vector elements (not bits) for a vector with NELT elements. */
4491 static void
4492 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4493 vec_perm_builder *sel)
4494 {
4495 /* The encoding is a single stepped pattern. Any wrap-around is handled
4496 by vec_perm_indices. */
4497 sel->new_vector (nelt, 1, 3);
4498 for (unsigned int i = 0; i < 3; i++)
4499 sel->quick_push (i + offset);
4500 }
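
/* For example (illustrative values): OFFSET = 2 and NELT = 8 encode the
   stepped selector {2, 3, 4}, which vec_perm_indices extends to
   {2, 3, 4, 5, 6, 7, 8, 9}; applied to the concatenation of the input
   vector with a second (zero) vector this shifts the contents down by
   two elements.  */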
4501
4502 /* Checks whether the target supports whole-vector shifts for vectors of mode
4503 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4504 it supports vec_perm_const with masks for all necessary shift amounts. */
4505 static bool
4506 have_whole_vector_shift (machine_mode mode)
4507 {
4508 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4509 return true;
4510
4511 /* Variable-length vectors should be handled via the optab. */
4512 unsigned int nelt;
4513 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4514 return false;
4515
4516 vec_perm_builder sel;
4517 vec_perm_indices indices;
4518 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4519 {
4520 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4521 indices.new_vector (sel, 2, nelt);
4522 if (!can_vec_perm_const_p (mode, indices, false))
4523 return false;
4524 }
4525 return true;
4526 }
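
/* Illustrative use of such shifts (assumed 4-element vector v, an
   associative OP and a conceptual vec_shr written as a function): the
   epilogue can reduce v in log2(NELT) steps,

     v = OP (v, vec_shr (v, 2));   .. lanes 0..1 now hold pairwise results
     v = OP (v, vec_shr (v, 1));   .. lane 0 now holds the full reduction
     s = <extract lane 0 of v>;

   which is the "vector shifts and the reduction operator" strategy whose
   cost is modelled in vect_model_reduction_cost below.  */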
4527
4528 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4529 functions. Design better to avoid maintenance issues. */
4530
4531 /* Function vect_model_reduction_cost.
4532
4533 Models cost for a reduction operation, including the vector ops
4534 generated within the strip-mine loop in some cases, the initial
4535 definition before the loop, and the epilogue code that must be generated. */
4536
4537 static void
4538 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4539 stmt_vec_info stmt_info, internal_fn reduc_fn,
4540 vect_reduction_type reduction_type,
4541 int ncopies, stmt_vector_for_cost *cost_vec)
4542 {
4543 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4544 tree vectype;
4545 machine_mode mode;
4546 class loop *loop = NULL;
4547
4548 if (loop_vinfo)
4549 loop = LOOP_VINFO_LOOP (loop_vinfo);
4550
4551 /* Condition reductions generate two reductions in the loop. */
4552 if (reduction_type == COND_REDUCTION)
4553 ncopies *= 2;
4554
4555 vectype = STMT_VINFO_VECTYPE (stmt_info);
4556 mode = TYPE_MODE (vectype);
4557 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4558
4559 gimple_match_op op;
4560 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4561 gcc_unreachable ();
4562
4563 if (reduction_type == EXTRACT_LAST_REDUCTION)
4564 /* No extra instructions are needed in the prologue. The loop body
4565 operations are costed in vectorizable_condition. */
4566 inside_cost = 0;
4567 else if (reduction_type == FOLD_LEFT_REDUCTION)
4568 {
4569 /* No extra instructions needed in the prologue. */
4570 prologue_cost = 0;
4571
4572 if (reduc_fn != IFN_LAST)
4573 /* Count one reduction-like operation per vector. */
4574 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4575 stmt_info, 0, vect_body);
4576 else
4577 {
4578 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4579 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4580 inside_cost = record_stmt_cost (cost_vec, nelements,
4581 vec_to_scalar, stmt_info, 0,
4582 vect_body);
4583 inside_cost += record_stmt_cost (cost_vec, nelements,
4584 scalar_stmt, stmt_info, 0,
4585 vect_body);
4586 }
4587 }
4588 else
4589 {
4590 /* Add in cost for initial definition.
4591 For cond reduction we have four vectors: initial index, step,
4592 initial result of the data reduction, initial value of the index
4593 reduction. */
4594 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4595 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4596 scalar_to_vec, stmt_info, 0,
4597 vect_prologue);
4598 }
4599
4600 /* Determine cost of epilogue code.
4601
4602 We have a reduction operator that will reduce the vector in one statement.
4603 Also requires scalar extract. */
4604
4605 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4606 {
4607 if (reduc_fn != IFN_LAST)
4608 {
4609 if (reduction_type == COND_REDUCTION)
4610 {
4611 /* An EQ stmt and a COND_EXPR stmt. */
4612 epilogue_cost += record_stmt_cost (cost_vec, 2,
4613 vector_stmt, stmt_info, 0,
4614 vect_epilogue);
4615 /* Reduction of the max index and a reduction of the found
4616 values. */
4617 epilogue_cost += record_stmt_cost (cost_vec, 2,
4618 vec_to_scalar, stmt_info, 0,
4619 vect_epilogue);
4620 /* A broadcast of the max value. */
4621 epilogue_cost += record_stmt_cost (cost_vec, 1,
4622 scalar_to_vec, stmt_info, 0,
4623 vect_epilogue);
4624 }
4625 else
4626 {
4627 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4628 stmt_info, 0, vect_epilogue);
4629 epilogue_cost += record_stmt_cost (cost_vec, 1,
4630 vec_to_scalar, stmt_info, 0,
4631 vect_epilogue);
4632 }
4633 }
4634 else if (reduction_type == COND_REDUCTION)
4635 {
4636 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4637 /* Extraction of scalar elements. */
4638 epilogue_cost += record_stmt_cost (cost_vec,
4639 2 * estimated_nunits,
4640 vec_to_scalar, stmt_info, 0,
4641 vect_epilogue);
4642 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4643 epilogue_cost += record_stmt_cost (cost_vec,
4644 2 * estimated_nunits - 3,
4645 scalar_stmt, stmt_info, 0,
4646 vect_epilogue);
4647 }
4648 else if (reduction_type == EXTRACT_LAST_REDUCTION
4649 || reduction_type == FOLD_LEFT_REDUCTION)
4650 /* No extra instructions are needed in the epilogue. */
4651 ;
4652 else
4653 {
4654 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4655 tree bitsize = TYPE_SIZE (op.type);
4656 int element_bitsize = tree_to_uhwi (bitsize);
4657 int nelements = vec_size_in_bits / element_bitsize;
4658
4659 if (op.code == COND_EXPR)
4660 op.code = MAX_EXPR;
4661
4662 /* We have a whole vector shift available. */
4663 if (VECTOR_MODE_P (mode)
4664 && directly_supported_p (op.code, vectype)
4665 && have_whole_vector_shift (mode))
4666 {
4667 /* Final reduction via vector shifts and the reduction operator.
4668 Also requires scalar extract. */
4669 epilogue_cost += record_stmt_cost (cost_vec,
4670 exact_log2 (nelements) * 2,
4671 vector_stmt, stmt_info, 0,
4672 vect_epilogue);
4673 epilogue_cost += record_stmt_cost (cost_vec, 1,
4674 vec_to_scalar, stmt_info, 0,
4675 vect_epilogue);
4676 }
4677 else
4678 /* Use extracts and reduction op for final reduction. For N
4679 elements, we have N extracts and N-1 reduction ops. */
4680 epilogue_cost += record_stmt_cost (cost_vec,
4681 nelements + nelements - 1,
4682 vector_stmt, stmt_info, 0,
4683 vect_epilogue);
4684 }
4685 }
4686
4687 if (dump_enabled_p ())
4688 dump_printf (MSG_NOTE,
4689 "vect_model_reduction_cost: inside_cost = %d, "
4690 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4691 prologue_cost, epilogue_cost);
4692 }
4693
4694 /* SEQ is a sequence of instructions that initialize the reduction
4695 described by REDUC_INFO. Emit them in the appropriate place. */
4696
4697 static void
4698 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4699 stmt_vec_info reduc_info, gimple *seq)
4700 {
4701 if (reduc_info->reused_accumulator)
4702 {
4703 /* When reusing an accumulator from the main loop, we only need
4704 initialization instructions if the main loop can be skipped.
4705 In that case, emit the initialization instructions at the end
4706 of the guard block that does the skip. */
4707 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4708 gcc_assert (skip_edge);
4709 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4710 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4711 }
4712 else
4713 {
4714 /* The normal case: emit the initialization instructions on the
4715 preheader edge. */
4716 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4717 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4718 }
4719 }
4720
4721 /* Function get_initial_def_for_reduction
4722
4723 Input:
4724 REDUC_INFO - the info_for_reduction
4725 INIT_VAL - the initial value of the reduction variable
4726 NEUTRAL_OP - a value that has no effect on the reduction, as per
4727 neutral_op_for_reduction
4728
4729 Output:
4730 Return a vector variable, initialized according to the operation that
4731 STMT_VINFO performs. This vector will be used as the initial value
4732 of the vector of partial results.
4733
4734 The value we need is a vector in which element 0 has value INIT_VAL
4735 and every other element has value NEUTRAL_OP. */
4736
4737 static tree
4738 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4739 stmt_vec_info reduc_info,
4740 tree init_val, tree neutral_op)
4741 {
4742 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4743 tree scalar_type = TREE_TYPE (init_val);
4744 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4745 tree init_def;
4746 gimple_seq stmts = NULL;
4747
4748 gcc_assert (vectype);
4749
4750 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4751 || SCALAR_FLOAT_TYPE_P (scalar_type));
4752
4753 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4754 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4755
4756 if (operand_equal_p (init_val, neutral_op))
4757 {
4758 /* If both elements are equal then the vector described above is
4759 just a splat. */
4760 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4761 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4762 }
4763 else
4764 {
4765 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4766 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4767 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4768 {
4769 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4770 element 0. */
4771 init_def = gimple_build_vector_from_val (&stmts, vectype,
4772 neutral_op);
4773 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4774 vectype, init_def, init_val);
4775 }
4776 else
4777 {
4778 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4779 tree_vector_builder elts (vectype, 1, 2);
4780 elts.quick_push (init_val);
4781 elts.quick_push (neutral_op);
4782 init_def = gimple_build_vector (&stmts, &elts);
4783 }
4784 }
4785
4786 if (stmts)
4787 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4788 return init_def;
4789 }
4790
4791 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4792 which performs a reduction involving GROUP_SIZE scalar statements.
4793 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4794 is nonnull, introducing extra elements of that value will not change the
4795 result. */
4796
4797 static void
4798 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4799 stmt_vec_info reduc_info,
4800 vec<tree> *vec_oprnds,
4801 unsigned int number_of_vectors,
4802 unsigned int group_size, tree neutral_op)
4803 {
4804 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4805 unsigned HOST_WIDE_INT nunits;
4806 unsigned j, number_of_places_left_in_vector;
4807 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4808 unsigned int i;
4809
4810 gcc_assert (group_size == initial_values.length () || neutral_op);
4811
4812 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4813 created vectors. It is greater than 1 if unrolling is performed.
4814
4815 For example, we have two scalar operands, s1 and s2 (e.g., group of
4816 strided accesses of size two), while NUNITS is four (i.e., four scalars
4817 of this type can be packed in a vector). The output vector will contain
4818 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4819 will be 2).
4820
4821 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4822 vectors containing the operands.
4823
4824 For example, NUNITS is four as before, and the group size is 8
4825 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4826 {s5, s6, s7, s8}. */
4827
4828 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4829 nunits = group_size;
4830
4831 number_of_places_left_in_vector = nunits;
4832 bool constant_p = true;
4833 tree_vector_builder elts (vector_type, nunits, 1);
4834 elts.quick_grow (nunits);
4835 gimple_seq ctor_seq = NULL;
4836 for (j = 0; j < nunits * number_of_vectors; ++j)
4837 {
4838 tree op;
4839 i = j % group_size;
4840
4841 /* Get the def before the loop.  In a reduction chain we have only one
4842 initial value; otherwise we have as many initial values as PHIs in the group. */
4843 if (i >= initial_values.length () || (j > i && neutral_op))
4844 op = neutral_op;
4845 else
4846 op = initial_values[i];
4847
4848 /* Create 'vect_ = {op0,op1,...,opn}'. */
4849 number_of_places_left_in_vector--;
4850 elts[nunits - number_of_places_left_in_vector - 1] = op;
4851 if (!CONSTANT_CLASS_P (op))
4852 constant_p = false;
4853
4854 if (number_of_places_left_in_vector == 0)
4855 {
4856 tree init;
4857 if (constant_p && !neutral_op
4858 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4859 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4860 /* Build the vector directly from ELTS. */
4861 init = gimple_build_vector (&ctor_seq, &elts);
4862 else if (neutral_op)
4863 {
4864 /* Build a vector of the neutral value and shift the
4865 other elements into place. */
4866 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4867 neutral_op);
4868 int k = nunits;
4869 while (k > 0 && elts[k - 1] == neutral_op)
4870 k -= 1;
4871 while (k > 0)
4872 {
4873 k -= 1;
4874 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4875 vector_type, init, elts[k]);
4876 }
4877 }
4878 else
4879 {
4880 /* First time round, duplicate ELTS to fill the
4881 required number of vectors. */
4882 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4883 elts, number_of_vectors, *vec_oprnds);
4884 break;
4885 }
4886 vec_oprnds->quick_push (init);
4887
4888 number_of_places_left_in_vector = nunits;
4889 elts.new_vector (vector_type, nunits, 1);
4890 elts.quick_grow (nunits);
4891 constant_p = true;
4892 }
4893 }
4894 if (ctor_seq != NULL)
4895 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4896 }
4897
4898 /* For a statement STMT_INFO taking part in a reduction operation return
4899 the stmt_vec_info the meta information is stored on. */
4900
4901 stmt_vec_info
4902 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4903 {
4904 stmt_info = vect_orig_stmt (stmt_info);
4905 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4906 if (!is_a <gphi *> (stmt_info->stmt)
4907 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4908 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4909 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4910 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4911 {
4912 if (gimple_phi_num_args (phi) == 1)
4913 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4914 }
4915 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4916 {
4917 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4918 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4919 stmt_info = info;
4920 }
4921 return stmt_info;
4922 }
4923
4924 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4925 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4926 return false. */
4927
4928 static bool
4929 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4930 stmt_vec_info reduc_info)
4931 {
4932 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4933 if (!main_loop_vinfo)
4934 return false;
4935
4936 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4937 return false;
4938
4939 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4940 auto_vec<tree, 16> main_loop_results (num_phis);
4941 auto_vec<tree, 16> initial_values (num_phis);
4942 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4943 {
4944 /* The epilogue loop can be entered either from the main loop or
4945 from an earlier guard block. */
4946 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4947 for (tree incoming_value : reduc_info->reduc_initial_values)
4948 {
4949 /* Look for:
4950
4951 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4952 INITIAL_VALUE(guard block)>. */
4953 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4954
4955 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4956 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4957
4958 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4959 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4960
4961 main_loop_results.quick_push (from_main_loop);
4962 initial_values.quick_push (from_skip);
4963 }
4964 }
4965 else
4966 /* The main loop dominates the epilogue loop. */
4967 main_loop_results.splice (reduc_info->reduc_initial_values);
4968
4969 /* See if the main loop has the kind of accumulator we need. */
4970 vect_reusable_accumulator *accumulator
4971 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4972 if (!accumulator
4973 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4974 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4975 accumulator->reduc_info->reduc_scalar_results.begin ()))
4976 return false;
4977
4978 /* Handle the case where we can reduce wider vectors to narrower ones. */
4979 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4980 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4981 unsigned HOST_WIDE_INT m;
4982 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4983 TYPE_VECTOR_SUBPARTS (vectype), &m))
4984 return false;
4985 /* Check the intermediate vector types and operations are available. */
4986 tree prev_vectype = old_vectype;
4987 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4988 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4989 {
4990 intermediate_nunits = exact_div (intermediate_nunits, 2);
4991 tree intermediate_vectype = get_related_vectype_for_scalar_type
4992 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
4993 if (!intermediate_vectype
4994 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
4995 intermediate_vectype)
4996 || !can_vec_extract (TYPE_MODE (prev_vectype),
4997 TYPE_MODE (intermediate_vectype)))
4998 return false;
4999 prev_vectype = intermediate_vectype;
5000 }
5001
5002 /* Non-SLP reductions might apply an adjustment after the reduction
5003 operation, in order to simplify the initialization of the accumulator.
5004 If the epilogue loop carries on from where the main loop left off,
5005 it should apply the same adjustment to the final reduction result.
5006
5007 If the epilogue loop can also be entered directly (rather than via
5008 the main loop), we need to be able to handle that case in the same way,
5009 with the same adjustment. (In principle we could add a PHI node
5010 to select the correct adjustment, but in practice that shouldn't be
5011 necessary.) */
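/* As an illustrative sketch (values invented for exposition): for
   "int s = 10; for (...) s += a[i];" the main loop starts its vector
   accumulator at {0, 0, ...} and records a MAIN_ADJUSTMENT of 10 to be
   added after the final reduction.  If the epilogue is entered directly,
   its incoming INITIAL_VALUE is also 10, so we keep 10 as the adjustment
   and start the epilogue accumulator from the neutral value 0 in both
   cases.  */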
5012 tree main_adjustment
5013 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5014 if (loop_vinfo->main_loop_edge && main_adjustment)
5015 {
5016 gcc_assert (num_phis == 1);
5017 tree initial_value = initial_values[0];
5018 /* Check that we can use INITIAL_VALUE as the adjustment and
5019 initialize the accumulator with a neutral value instead. */
5020 if (!operand_equal_p (initial_value, main_adjustment))
5021 return false;
5022 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5023 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5024 code, initial_value);
5025 }
5026 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5027 reduc_info->reduc_initial_values.truncate (0);
5028 reduc_info->reduc_initial_values.splice (initial_values);
5029 reduc_info->reused_accumulator = accumulator;
5030 return true;
5031 }
5032
5033 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5034 CODE, emitting the new stmts into SEQ.  Returns a vector def of VECTYPE. */
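/* Illustrative example (concrete modes chosen only for exposition):
   reducing a V8SI vector {a0, ..., a7} down to V4SI with PLUS extracts
   the low half {a0, a1, a2, a3} and the high half {a4, a5, a6, a7}
   (either via vec_extract or by punning through an integer vector mode)
   and emits a single V4SI addition, giving {a0+a4, a1+a5, a2+a6, a3+a7}.  */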
5035
5036 static tree
5037 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5038 gimple_seq *seq)
5039 {
5040 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5041 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5042 tree stype = TREE_TYPE (vectype);
5043 tree new_temp = vec_def;
5044 while (nunits > nunits1)
5045 {
5046 nunits /= 2;
5047 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5048 stype, nunits);
5049 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5050
5051 /* The target has to make sure we support lowpart/highpart
5052 extraction, either via direct vector extract or through
5053 an integer mode punning. */
5054 tree dst1, dst2;
5055 gimple *epilog_stmt;
5056 if (convert_optab_handler (vec_extract_optab,
5057 TYPE_MODE (TREE_TYPE (new_temp)),
5058 TYPE_MODE (vectype1))
5059 != CODE_FOR_nothing)
5060 {
5061 /* Extract sub-vectors directly once vec_extract becomes
5062 a conversion optab. */
5063 dst1 = make_ssa_name (vectype1);
5064 epilog_stmt
5065 = gimple_build_assign (dst1, BIT_FIELD_REF,
5066 build3 (BIT_FIELD_REF, vectype1,
5067 new_temp, TYPE_SIZE (vectype1),
5068 bitsize_int (0)));
5069 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5070 dst2 = make_ssa_name (vectype1);
5071 epilog_stmt
5072 = gimple_build_assign (dst2, BIT_FIELD_REF,
5073 build3 (BIT_FIELD_REF, vectype1,
5074 new_temp, TYPE_SIZE (vectype1),
5075 bitsize_int (bitsize)));
5076 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5077 }
5078 else
5079 {
5080 /* Extract via punning to appropriately sized integer mode
5081 vector. */
5082 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5083 tree etype = build_vector_type (eltype, 2);
5084 gcc_assert (convert_optab_handler (vec_extract_optab,
5085 TYPE_MODE (etype),
5086 TYPE_MODE (eltype))
5087 != CODE_FOR_nothing);
5088 tree tem = make_ssa_name (etype);
5089 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5090 build1 (VIEW_CONVERT_EXPR,
5091 etype, new_temp));
5092 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5093 new_temp = tem;
5094 tem = make_ssa_name (eltype);
5095 epilog_stmt
5096 = gimple_build_assign (tem, BIT_FIELD_REF,
5097 build3 (BIT_FIELD_REF, eltype,
5098 new_temp, TYPE_SIZE (eltype),
5099 bitsize_int (0)));
5100 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5101 dst1 = make_ssa_name (vectype1);
5102 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5103 build1 (VIEW_CONVERT_EXPR,
5104 vectype1, tem));
5105 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5106 tem = make_ssa_name (eltype);
5107 epilog_stmt
5108 = gimple_build_assign (tem, BIT_FIELD_REF,
5109 build3 (BIT_FIELD_REF, eltype,
5110 new_temp, TYPE_SIZE (eltype),
5111 bitsize_int (bitsize)));
5112 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5113 dst2 = make_ssa_name (vectype1);
5114 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5115 build1 (VIEW_CONVERT_EXPR,
5116 vectype1, tem));
5117 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5118 }
5119
5120 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5121 }
5122
5123 return new_temp;
5124 }
5125
5126 /* Function vect_create_epilog_for_reduction
5127
5128 Create code at the loop-epilog to finalize the result of a reduction
5129 computation.
5130
5131 STMT_INFO is the scalar reduction stmt that is being vectorized.
5132 SLP_NODE is an SLP node containing a group of reduction statements. The
5133 first one in this group is STMT_INFO.
5134 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5135 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5136 (counting from 0)
5137
5138 This function:
5139 1. Completes the reduction def-use cycles.
5140 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5141 by calling the function specified by REDUC_FN if available, or by
5142 other means (whole-vector shifts or a scalar loop).
5143 The function also creates a new phi node at the loop exit to preserve
5144 loop-closed form, as illustrated below.
5145
5146 The flow at the entry to this function:
5147
5148 loop:
5149 vec_def = phi <vec_init, null> # REDUCTION_PHI
5150 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5151 s_loop = scalar_stmt # (scalar) STMT_INFO
5152 loop_exit:
5153 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5154 use <s_out0>
5155 use <s_out0>
5156
5157 The above is transformed by this function into:
5158
5159 loop:
5160 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5161 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5162 s_loop = scalar_stmt # (scalar) STMT_INFO
5163 loop_exit:
5164 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5165 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5166 v_out2 = reduce <v_out1>
5167 s_out3 = extract_field <v_out2, 0>
5168 s_out4 = adjust_result <s_out3>
5169 use <s_out4>
5170 use <s_out4>
5171 */
5172
5173 static void
5174 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5175 stmt_vec_info stmt_info,
5176 slp_tree slp_node,
5177 slp_instance slp_node_instance)
5178 {
5179 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5180 gcc_assert (reduc_info->is_reduc_info);
5181 /* For double reductions we need to get at the inner loop reduction
5182 stmt which has the meta info attached. Our stmt_info is that of the
5183 loop-closed PHI of the inner loop which we remember as
5184 def for the reduction PHI generation. */
5185 bool double_reduc = false;
5186 stmt_vec_info rdef_info = stmt_info;
5187 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5188 {
5189 gcc_assert (!slp_node);
5190 double_reduc = true;
5191 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5192 (stmt_info->stmt, 0));
5193 stmt_info = vect_stmt_to_vectorize (stmt_info);
5194 }
5195 gphi *reduc_def_stmt
5196 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5197 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5198 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5199 tree vectype;
5200 machine_mode mode;
5201 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5202 basic_block exit_bb;
5203 tree scalar_dest;
5204 tree scalar_type;
5205 gimple *new_phi = NULL, *phi;
5206 gimple_stmt_iterator exit_gsi;
5207 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5208 gimple *epilog_stmt = NULL;
5209 gimple *exit_phi;
5210 tree bitsize;
5211 tree def;
5212 tree orig_name, scalar_result;
5213 imm_use_iterator imm_iter, phi_imm_iter;
5214 use_operand_p use_p, phi_use_p;
5215 gimple *use_stmt;
5216 auto_vec<tree> reduc_inputs;
5217 int j, i;
5218 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5219 unsigned int group_size = 1, k;
5220 auto_vec<gimple *> phis;
5221 /* SLP reduction without reduction chain, e.g.,
5222 # a1 = phi <a2, a0>
5223 # b1 = phi <b2, b0>
5224 a2 = operation (a1)
5225 b2 = operation (b1) */
5226 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5227 bool direct_slp_reduc;
5228 tree induction_index = NULL_TREE;
5229
5230 if (slp_node)
5231 group_size = SLP_TREE_LANES (slp_node);
5232
5233 if (nested_in_vect_loop_p (loop, stmt_info))
5234 {
5235 outer_loop = loop;
5236 loop = loop->inner;
5237 gcc_assert (!slp_node && double_reduc);
5238 }
5239
5240 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5241 gcc_assert (vectype);
5242 mode = TYPE_MODE (vectype);
5243
5244 tree induc_val = NULL_TREE;
5245 tree adjustment_def = NULL;
5246 if (slp_node)
5247 ;
5248 else
5249 {
5250 /* Optimize: for induction condition reduction, if we can't use zero
5251 for induc_val, use initial_def. */
5252 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5253 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5254 else if (double_reduc)
5255 ;
5256 else
5257 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5258 }
5259
5260 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5261 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5262 if (slp_reduc)
5263 /* All statements produce live-out values. */
5264 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5265 else if (slp_node)
5266 {
5267 /* The last statement in the reduction chain produces the live-out
5268 value. Note SLP optimization can shuffle scalar stmts to
5269 optimize permutations so we have to search for the last stmt. */
5270 for (k = 0; k < group_size; ++k)
5271 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5272 {
5273 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5274 break;
5275 }
5276 }
5277
5278 unsigned vec_num;
5279 int ncopies;
5280 if (slp_node)
5281 {
5282 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5283 ncopies = 1;
5284 }
5285 else
5286 {
5287 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5288 vec_num = 1;
5289 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5290 }
5291
5292 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5293 which is updated with the current index of the loop for every match of
5294 the original loop's cond_expr (VEC_STMT). This results in a vector
5295 containing the last time the condition passed for that vector lane.
5296 The first match will be a 1 to allow 0 to be used for non-matching
5297 indexes. If there are no matches at all then the vector will be all
5298 zeroes.
5299
5300 PR92772: This algorithm is broken for architectures that support
5301 masked vectors, but do not provide fold_extract_last. */
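/* Illustrative example (lane count and values invented for exposition):
   for the scalar loop

     for (i = 0; i < n; ++i)
       if (a[i] < b[i])
         last = c[i];

   with four lanes the index IV starts at {1, 2, 3, 4} and steps by 4,
   so after two vector iterations a lane whose condition last held while
   processing the sixth scalar element carries the value 6, and a lane
   that never matched still carries 0.  The lane with the largest index
   therefore identifies the element of the data vector that holds the
   final value of LAST.  */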
5302 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5303 {
5304 auto_vec<std::pair<tree, bool>, 2> ccompares;
5305 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5306 cond_info = vect_stmt_to_vectorize (cond_info);
5307 while (cond_info != reduc_info)
5308 {
5309 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5310 {
5311 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5312 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5313 ccompares.safe_push
5314 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5315 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5316 }
5317 cond_info
5318 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5319 1 + STMT_VINFO_REDUC_IDX
5320 (cond_info)));
5321 cond_info = vect_stmt_to_vectorize (cond_info);
5322 }
5323 gcc_assert (ccompares.length () != 0);
5324
5325 tree indx_before_incr, indx_after_incr;
5326 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5327 int scalar_precision
5328 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5329 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5330 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5331 (TYPE_MODE (vectype), cr_index_scalar_type,
5332 TYPE_VECTOR_SUBPARTS (vectype));
5333
5334 /* First we create a simple vector induction variable which starts
5335 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5336 vector size (STEP). */
5337
5338 /* Create a {1,2,3,...} vector. */
5339 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5340
5341 /* Create a vector of the step value. */
5342 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5343 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5344
5345 /* Create an induction variable. */
5346 gimple_stmt_iterator incr_gsi;
5347 bool insert_after;
5348 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5349 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5350 insert_after, &indx_before_incr, &indx_after_incr);
5351
5352 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5353 filled with zeros (VEC_ZERO). */
5354
5355 /* Create a vector of 0s. */
5356 tree zero = build_zero_cst (cr_index_scalar_type);
5357 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5358
5359 /* Create a vector phi node. */
5360 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5361 new_phi = create_phi_node (new_phi_tree, loop->header);
5362 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5363 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5364
5365 /* Now take the condition from the loop's original cond_exprs
5366 and produce new cond_exprs (INDEX_COND_EXPR) which for
5367 every match uses values from the induction variable
5368 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5369 (NEW_PHI_TREE).
5370 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5371 the new cond_expr (INDEX_COND_EXPR). */
5372 gimple_seq stmts = NULL;
5373 for (int i = ccompares.length () - 1; i != -1; --i)
5374 {
5375 tree ccompare = ccompares[i].first;
5376 if (ccompares[i].second)
5377 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5378 cr_index_vector_type,
5379 ccompare,
5380 indx_before_incr, new_phi_tree);
5381 else
5382 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5383 cr_index_vector_type,
5384 ccompare,
5385 new_phi_tree, indx_before_incr);
5386 }
5387 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5388
5389 /* Update the phi with the vec cond. */
5390 induction_index = new_phi_tree;
5391 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5392 loop_latch_edge (loop), UNKNOWN_LOCATION);
5393 }
5394
5395 /* 2. Create epilog code.
5396 The reduction epilog code operates across the elements of the vector
5397 of partial results computed by the vectorized loop.
5398 The reduction epilog code consists of:
5399
5400 step 1: compute the scalar result in a vector (v_out2)
5401 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5402 step 3: adjust the scalar result (s_out3) if needed.
5403
5404 Step 1 can be accomplished using one of the following three schemes:
5405 (scheme 1) using reduc_fn, if available.
5406 (scheme 2) using whole-vector shifts, if available.
5407 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5408 combined.
5409
5410 The overall epilog code looks like this:
5411
5412 s_out0 = phi <s_loop> # original EXIT_PHI
5413 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5414 v_out2 = reduce <v_out1> # step 1
5415 s_out3 = extract_field <v_out2, 0> # step 2
5416 s_out4 = adjust_result <s_out3> # step 3
5417
5418 (step 3 is optional, and steps 1 and 2 may be combined).
5419 Lastly, the uses of s_out0 are replaced by s_out4. */
5420
5421
5422 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5423 v_out1 = phi <VECT_DEF>
5424 Store their (converted) results in REDUC_INPUTS. */
5425 if (double_reduc)
5426 loop = outer_loop;
5427 exit_bb = single_exit (loop)->dest;
5428 exit_gsi = gsi_after_labels (exit_bb);
5429 reduc_inputs.create (slp_node ? vec_num : ncopies);
5430 for (unsigned i = 0; i < vec_num; i++)
5431 {
5432 gimple_seq stmts = NULL;
5433 if (slp_node)
5434 def = vect_get_slp_vect_def (slp_node, i);
5435 else
5436 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5437 for (j = 0; j < ncopies; j++)
5438 {
5439 tree new_def = copy_ssa_name (def);
5440 phi = create_phi_node (new_def, exit_bb);
5441 if (j)
5442 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5443 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5444 new_def = gimple_convert (&stmts, vectype, new_def);
5445 reduc_inputs.quick_push (new_def);
5446 }
5447 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5448 }
5449
5450 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5451 (i.e. when reduc_fn is not available) and in the final adjustment
5452 code (if needed). Also get the original scalar reduction variable as
5453 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5454 represents a reduction pattern), the tree-code and scalar-def are
5455 taken from the original stmt that the pattern-stmt (STMT) replaces.
5456 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5457 are taken from STMT. */
5458
5459 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5460 if (orig_stmt_info != stmt_info)
5461 {
5462 /* Reduction pattern */
5463 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5464 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5465 }
5466
5467 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5468 scalar_type = TREE_TYPE (scalar_dest);
5469 scalar_results.truncate (0);
5470 scalar_results.reserve_exact (group_size);
5471 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5472 bitsize = TYPE_SIZE (scalar_type);
5473
5474 /* True if we should implement SLP_REDUC using native reduction operations
5475 instead of scalar operations. */
5476 direct_slp_reduc = (reduc_fn != IFN_LAST
5477 && slp_reduc
5478 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5479
5480 /* In case of reduction chain, e.g.,
5481 # a1 = phi <a3, a0>
5482 a2 = operation (a1)
5483 a3 = operation (a2),
5484
5485 we may end up with more than one vector result. Here we reduce them
5486 to one vector.
5487
5488 The same is true if we couldn't use a single def-use cycle. */
5489 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5490 || direct_slp_reduc
5491 || ncopies > 1)
5492 {
5493 gimple_seq stmts = NULL;
5494 tree single_input = reduc_inputs[0];
5495 for (k = 1; k < reduc_inputs.length (); k++)
5496 single_input = gimple_build (&stmts, code, vectype,
5497 single_input, reduc_inputs[k]);
5498 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5499
5500 reduc_inputs.truncate (0);
5501 reduc_inputs.safe_push (single_input);
5502 }
5503
5504 tree orig_reduc_input = reduc_inputs[0];
5505
5506 /* If this loop is an epilogue loop that can be skipped after the
5507 main loop, we can only share a reduction operation between the
5508 main loop and the epilogue if we put it at the target of the
5509 skip edge.
5510
5511 We can still reuse accumulators if this check fails. Doing so has
5512 the minor(?) benefit of making the epilogue loop's scalar result
5513 independent of the main loop's scalar result. */
5514 bool unify_with_main_loop_p = false;
5515 if (reduc_info->reused_accumulator
5516 && loop_vinfo->skip_this_loop_edge
5517 && single_succ_p (exit_bb)
5518 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5519 {
5520 unify_with_main_loop_p = true;
5521
5522 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5523 reduc_inputs[0] = make_ssa_name (vectype);
5524 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5525 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5526 UNKNOWN_LOCATION);
5527 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5528 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5529 exit_gsi = gsi_after_labels (reduc_block);
5530 }
5531
5532 /* Shouldn't be used beyond this point. */
5533 exit_bb = nullptr;
5534
5535 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5536 && reduc_fn != IFN_LAST)
5537 {
5538 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5539 various data values where the condition matched and another vector
5540 (INDUCTION_INDEX) containing all the indexes of those matches. We
5541 need to extract the last matching index (which will be the index with
5542 highest value) and use this to index into the data vector.
5543 For the case where there were no matches, the data vector will contain
5544 all default values and the index vector will be all zeros. */
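/* Worked example (values invented for exposition): with INDUCTION_INDEX
   {0, 2, 0, 5} and data vector {d0, d1, d2, d3}, IFN_REDUC_MAX over the
   indexes yields 5; comparing against the splat {5, 5, 5, 5} selects
   {0, 0, 0, d3}; and the final unsigned MAX reduction of that vector
   recovers d3, the data value from the last matching lane.  */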
5545
5546 /* Get various versions of the type of the vector of indexes. */
5547 tree index_vec_type = TREE_TYPE (induction_index);
5548 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5549 tree index_scalar_type = TREE_TYPE (index_vec_type);
5550 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5551
5552 /* Get an unsigned integer version of the type of the data vector. */
5553 int scalar_precision
5554 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5555 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5556 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5557 vectype);
5558
5559 /* First we need to create a vector (ZERO_VEC) of zeros and another
5560 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5561 can create using a MAX reduction and then expanding.
5562 In the case where the loop never made any matches, the max index will
5563 be zero. */
5564
5565 /* Vector of {0, 0, 0,...}. */
5566 tree zero_vec = build_zero_cst (vectype);
5567
5568 /* Find maximum value from the vector of found indexes. */
5569 tree max_index = make_ssa_name (index_scalar_type);
5570 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5571 1, induction_index);
5572 gimple_call_set_lhs (max_index_stmt, max_index);
5573 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5574
5575 /* Vector of {max_index, max_index, max_index,...}. */
5576 tree max_index_vec = make_ssa_name (index_vec_type);
5577 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5578 max_index);
5579 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5580 max_index_vec_rhs);
5581 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5582
5583 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5584 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5585 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5586 otherwise. Only one value should match, resulting in a vector
5587 (VEC_COND) with one data value and the rest zeros.
5588 In the case where the loop never made any matches, every index will
5589 match, resulting in a vector with all data values (which will all be
5590 the default value). */
5591
5592 /* Compare the max index vector to the vector of found indexes to find
5593 the position of the max value. */
5594 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5595 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5596 induction_index,
5597 max_index_vec);
5598 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5599
5600 /* Use the compare to choose either values from the data vector or
5601 zero. */
5602 tree vec_cond = make_ssa_name (vectype);
5603 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5604 vec_compare,
5605 reduc_inputs[0],
5606 zero_vec);
5607 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5608
5609 /* Finally we need to extract the data value from the vector (VEC_COND)
5610 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5611 reduction, but because this doesn't exist, we can use a MAX reduction
5612 instead. The data value might be signed or a float so we need to cast
5613 it first.
5614 In the case where the loop never made any matches, the data values are
5615 all identical, and so will reduce down correctly. */
5616
5617 /* Make the matched data values unsigned. */
5618 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5619 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5620 vec_cond);
5621 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5622 VIEW_CONVERT_EXPR,
5623 vec_cond_cast_rhs);
5624 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5625
5626 /* Reduce down to a scalar value. */
5627 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5628 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5629 1, vec_cond_cast);
5630 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5631 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5632
5633 /* Convert the reduced value back to the result type and set as the
5634 result. */
5635 gimple_seq stmts = NULL;
5636 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5637 data_reduc);
5638 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5639 scalar_results.safe_push (new_temp);
5640 }
5641 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5642 && reduc_fn == IFN_LAST)
5643 {
5644 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5645 idx_val = induction_index[0];
5646 val = data_reduc[0];
5647 for (i = 1; i < nelts; ++i)
5648 if (induction_index[i] > idx_val)
5649 val = data_reduc[i],
5650 idx_val = induction_index[i];
5651 return val; */
5652
5653 tree data_eltype = TREE_TYPE (vectype);
5654 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5655 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5656 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5657 /* Enforced by vectorizable_reduction, which ensures we have target
5658 support before allowing a conditional reduction on variable-length
5659 vectors. */
5660 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5661 tree idx_val = NULL_TREE, val = NULL_TREE;
5662 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5663 {
5664 tree old_idx_val = idx_val;
5665 tree old_val = val;
5666 idx_val = make_ssa_name (idx_eltype);
5667 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5668 build3 (BIT_FIELD_REF, idx_eltype,
5669 induction_index,
5670 bitsize_int (el_size),
5671 bitsize_int (off)));
5672 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5673 val = make_ssa_name (data_eltype);
5674 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5675 build3 (BIT_FIELD_REF,
5676 data_eltype,
5677 reduc_inputs[0],
5678 bitsize_int (el_size),
5679 bitsize_int (off)));
5680 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681 if (off != 0)
5682 {
5683 tree new_idx_val = idx_val;
5684 if (off != v_size - el_size)
5685 {
5686 new_idx_val = make_ssa_name (idx_eltype);
5687 epilog_stmt = gimple_build_assign (new_idx_val,
5688 MAX_EXPR, idx_val,
5689 old_idx_val);
5690 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5691 }
5692 tree new_val = make_ssa_name (data_eltype);
5693 epilog_stmt = gimple_build_assign (new_val,
5694 COND_EXPR,
5695 build2 (GT_EXPR,
5696 boolean_type_node,
5697 idx_val,
5698 old_idx_val),
5699 val, old_val);
5700 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5701 idx_val = new_idx_val;
5702 val = new_val;
5703 }
5704 }
5705 /* Convert the reduced value back to the result type and set as the
5706 result. */
5707 gimple_seq stmts = NULL;
5708 val = gimple_convert (&stmts, scalar_type, val);
5709 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5710 scalar_results.safe_push (val);
5711 }
5712
5713 /* 2.3 Create the reduction code, using one of the three schemes described
5714 above. In SLP we simply need to extract all the elements from the
5715 vector (without reducing them), so we use scalar shifts. */
5716 else if (reduc_fn != IFN_LAST && !slp_reduc)
5717 {
5718 tree tmp;
5719 tree vec_elem_type;
5720
5721 /* Case 1: Create:
5722 v_out2 = reduc_expr <v_out1> */
5723
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_NOTE, vect_location,
5726 "Reduce using direct vector reduction.\n");
5727
5728 gimple_seq stmts = NULL;
5729 vec_elem_type = TREE_TYPE (vectype);
5730 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5731 vec_elem_type, reduc_inputs[0]);
5732 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5733 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5734
5735 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5736 && induc_val)
5737 {
5738 /* Earlier we set the initial value to be a vector of induc_val
5739 values. Check the result and if it is induc_val then replace
5740 with the original initial value, unless induc_val is
5741 the same as initial_def already. */
5742 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5743 induc_val);
5744 tree initial_def = reduc_info->reduc_initial_values[0];
5745
5746 tmp = make_ssa_name (new_scalar_dest);
5747 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5748 initial_def, new_temp);
5749 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 new_temp = tmp;
5751 }
5752
5753 scalar_results.safe_push (new_temp);
5754 }
5755 else if (direct_slp_reduc)
5756 {
5757 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5758 with the elements for other SLP statements replaced with the
5759 neutral value. We can then do a normal reduction on each vector. */
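/* Illustrative example (shown with a fixed-length four-lane vector purely
   for exposition, although this path is only taken for variable-length
   vectors): for a two-statement SLP group computing PLUS reductions A and
   B, a vector of partial results laid out as {a0, b0, a1, b1} is turned
   into {a0, 0, a1, 0} for A and {0, b0, 0, b1} for B, and each of those
   is then handed to the normal REDUC_FN reduction.  */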
5760
5761 /* Enforced by vectorizable_reduction. */
5762 gcc_assert (reduc_inputs.length () == 1);
5763 gcc_assert (pow2p_hwi (group_size));
5764
5765 gimple_seq seq = NULL;
5766
5767 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5768 and the same element size as VECTYPE. */
5769 tree index = build_index_vector (vectype, 0, 1);
5770 tree index_type = TREE_TYPE (index);
5771 tree index_elt_type = TREE_TYPE (index_type);
5772 tree mask_type = truth_type_for (index_type);
5773
5774 /* Create a vector that, for each element, identifies which of
5775 the REDUC_GROUP_SIZE results should use it. */
5776 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5777 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5778 build_vector_from_val (index_type, index_mask));
5779
5780 /* Get a neutral vector value. This is simply a splat of the neutral
5781 scalar value if we have one, otherwise the initial scalar value
5782 is itself a neutral value. */
5783 tree vector_identity = NULL_TREE;
5784 tree neutral_op = NULL_TREE;
5785 if (slp_node)
5786 {
5787 tree initial_value = NULL_TREE;
5788 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5789 initial_value = reduc_info->reduc_initial_values[0];
5790 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5791 initial_value);
5792 }
5793 if (neutral_op)
5794 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5795 neutral_op);
5796 for (unsigned int i = 0; i < group_size; ++i)
5797 {
5798 /* If there's no universal neutral value, we can use the
5799 initial scalar value from the original PHI. This is used
5800 for MIN and MAX reduction, for example. */
5801 if (!neutral_op)
5802 {
5803 tree scalar_value = reduc_info->reduc_initial_values[i];
5804 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5805 scalar_value);
5806 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5807 scalar_value);
5808 }
5809
5810 /* Calculate the equivalent of:
5811
5812 sel[j] = (index[j] == i);
5813
5814 which selects the elements of REDUC_INPUTS[0] that should
5815 be included in the result. */
5816 tree compare_val = build_int_cst (index_elt_type, i);
5817 compare_val = build_vector_from_val (index_type, compare_val);
5818 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5819 index, compare_val);
5820
5821 /* Calculate the equivalent of:
5822
5823 vec = sel ? reduc_inputs[0] : vector_identity;
5824
5825 VEC is now suitable for a full vector reduction. */
5826 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5827 sel, reduc_inputs[0], vector_identity);
5828
5829 /* Do the reduction and convert it to the appropriate type. */
5830 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5831 TREE_TYPE (vectype), vec);
5832 scalar = gimple_convert (&seq, scalar_type, scalar);
5833 scalar_results.safe_push (scalar);
5834 }
5835 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5836 }
5837 else
5838 {
5839 bool reduce_with_shift;
5840 tree vec_temp;
5841
5842 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5843
5844 /* See if the target wants to do the final (shift) reduction
5845 in a vector mode of smaller size and first reduce upper/lower
5846 halves against each other. */
5847 enum machine_mode mode1 = mode;
5848 tree stype = TREE_TYPE (vectype);
5849 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5850 unsigned nunits1 = nunits;
5851 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5852 && reduc_inputs.length () == 1)
5853 {
5854 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5855 /* For SLP reductions we have to make sure lanes match up, but
5856 since we're doing an individual-element final reduction, reducing
5857 vector width here is even more important.
5858 ??? We can also separate lanes with permutes, for the common
5859 case of power-of-two group-size odd/even extracts would work. */
5860 if (slp_reduc && nunits != nunits1)
5861 {
5862 nunits1 = least_common_multiple (nunits1, group_size);
5863 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5864 }
5865 }
5866 if (!slp_reduc
5867 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5868 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5869
5870 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5871 stype, nunits1);
5872 reduce_with_shift = have_whole_vector_shift (mode1);
5873 if (!VECTOR_MODE_P (mode1)
5874 || !directly_supported_p (code, vectype1))
5875 reduce_with_shift = false;
5876
5877 /* First reduce the vector to the vector size we should do the
5878 shift reduction on, by combining upper and lower halves. */
5879 gimple_seq stmts = NULL;
5880 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5881 code, &stmts);
5882 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5883 reduc_inputs[0] = new_temp;
5884
5885 if (reduce_with_shift && !slp_reduc)
5886 {
5887 int element_bitsize = tree_to_uhwi (bitsize);
5888 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5889 for variable-length vectors and also requires direct target support
5890 for loop reductions. */
5891 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5892 int nelements = vec_size_in_bits / element_bitsize;
5893 vec_perm_builder sel;
5894 vec_perm_indices indices;
5895
5896 int elt_offset;
5897
5898 tree zero_vec = build_zero_cst (vectype1);
5899 /* Case 2: Create:
5900 for (offset = nelements/2; offset >= 1; offset/=2)
5901 {
5902 Create: va' = vec_shift <va, offset>
5903 Create: va = vop <va, va'>
5904 } */
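/* Illustrative trace (four elements, PLUS reduction, don't-care lanes
   shown as "."): starting from {a, b, c, d},
     shift by 2: {c, d, 0, 0}, add -> {a+c, b+d, ., .}
     shift by 1: {b+d, ., ., 0}, add -> {a+b+c+d, ., ., .}
   after which element 0 holds the scalar result.  */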
5905
5906 tree rhs;
5907
5908 if (dump_enabled_p ())
5909 dump_printf_loc (MSG_NOTE, vect_location,
5910 "Reduce using vector shifts\n");
5911
5912 gimple_seq stmts = NULL;
5913 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5914 for (elt_offset = nelements / 2;
5915 elt_offset >= 1;
5916 elt_offset /= 2)
5917 {
5918 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5919 indices.new_vector (sel, 2, nelements);
5920 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5921 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5922 new_temp, zero_vec, mask);
5923 new_temp = gimple_build (&stmts, code,
5924 vectype1, new_name, new_temp);
5925 }
5926 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5927
5928 /* 2.4 Extract the final scalar result. Create:
5929 s_out3 = extract_field <v_out2, bitpos> */
5930
5931 if (dump_enabled_p ())
5932 dump_printf_loc (MSG_NOTE, vect_location,
5933 "extract scalar result\n");
5934
5935 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5936 bitsize, bitsize_zero_node);
5937 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5938 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5939 gimple_assign_set_lhs (epilog_stmt, new_temp);
5940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5941 scalar_results.safe_push (new_temp);
5942 }
5943 else
5944 {
5945 /* Case 3: Create:
5946 s = extract_field <v_out2, 0>
5947 for (offset = element_size;
5948 offset < vector_size;
5949 offset += element_size;)
5950 {
5951 Create: s' = extract_field <v_out2, offset>
5952 Create: s = op <s, s'> // For non SLP cases
5953 } */
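/* Illustrative trace (four elements, PLUS reduction): for {a, b, c, d}
   the generated scalar code is effectively
     s = a;  s = s + b;  s = s + c;  s = s + d;
   whereas in the SLP case each extracted element is pushed to
   SCALAR_RESULTS on its own instead of being combined.  */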
5954
5955 if (dump_enabled_p ())
5956 dump_printf_loc (MSG_NOTE, vect_location,
5957 "Reduce using scalar code.\n");
5958
5959 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5960 int element_bitsize = tree_to_uhwi (bitsize);
5961 tree compute_type = TREE_TYPE (vectype);
5962 gimple_seq stmts = NULL;
5963 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5964 {
5965 int bit_offset;
5966 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5967 vec_temp, bitsize, bitsize_zero_node);
5968
5969 /* In SLP we don't need to apply reduction operation, so we just
5970 collect s' values in SCALAR_RESULTS. */
5971 if (slp_reduc)
5972 scalar_results.safe_push (new_temp);
5973
5974 for (bit_offset = element_bitsize;
5975 bit_offset < vec_size_in_bits;
5976 bit_offset += element_bitsize)
5977 {
5978 tree bitpos = bitsize_int (bit_offset);
5979 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5980 compute_type, vec_temp,
5981 bitsize, bitpos);
5982 if (slp_reduc)
5983 {
5984 /* In SLP we don't need to apply reduction operation, so
5985 we just collect s' values in SCALAR_RESULTS. */
5986 new_temp = new_name;
5987 scalar_results.safe_push (new_name);
5988 }
5989 else
5990 new_temp = gimple_build (&stmts, code, compute_type,
5991 new_name, new_temp);
5992 }
5993 }
5994
5995 /* The only case where we need to reduce scalar results in SLP is
5996 unrolling. If the size of SCALAR_RESULTS is greater than
5997 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5998 REDUC_GROUP_SIZE. */
5999 if (slp_reduc)
6000 {
6001 tree res, first_res, new_res;
6002
6003 /* Reduce multiple scalar results in case of SLP unrolling. */
6004 for (j = group_size; scalar_results.iterate (j, &res);
6005 j++)
6006 {
6007 first_res = scalar_results[j % group_size];
6008 new_res = gimple_build (&stmts, code, compute_type,
6009 first_res, res);
6010 scalar_results[j % group_size] = new_res;
6011 }
6012 scalar_results.truncate (group_size);
6013 for (k = 0; k < group_size; k++)
6014 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6015 scalar_results[k]);
6016 }
6017 else
6018 {
6019 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6020 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6021 scalar_results.safe_push (new_temp);
6022 }
6023
6024 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6025 }
6026
6027 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6028 && induc_val)
6029 {
6030 /* Earlier we set the initial value to be a vector of induc_val
6031 values. Check the result and if it is induc_val then replace
6032 with the original initial value, unless induc_val is
6033 the same as initial_def already. */
6034 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6035 induc_val);
6036 tree initial_def = reduc_info->reduc_initial_values[0];
6037
6038 tree tmp = make_ssa_name (new_scalar_dest);
6039 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6040 initial_def, new_temp);
6041 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6042 scalar_results[0] = tmp;
6043 }
6044 }
6045
6046 /* 2.5 Adjust the final result by the initial value of the reduction
6047 variable. (When such adjustment is not needed, then
6048 'adjustment_def' is zero). For example, if code is PLUS we create:
6049 new_temp = loop_exit_def + adjustment_def */
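/* Illustrative example (values invented for exposition): for
   "int s = 10; for (...) s += a[i];" the vector accumulator was seeded
   with {0, 0, ...}, so here the recorded adjustment 10 is added back,
   either to the reduced scalar or, for double reductions, to the vector
   of partial results.  */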
6050
6051 if (adjustment_def)
6052 {
6053 gcc_assert (!slp_reduc);
6054 gimple_seq stmts = NULL;
6055 if (double_reduc)
6056 {
6057 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6058 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6059 new_temp = gimple_build (&stmts, code, vectype,
6060 reduc_inputs[0], adjustment_def);
6061 }
6062 else
6063 {
6064 new_temp = scalar_results[0];
6065 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6066 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6067 adjustment_def);
6068 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6069 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6070 new_temp, adjustment_def);
6071 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6072 }
6073
6074 epilog_stmt = gimple_seq_last_stmt (stmts);
6075 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6076 scalar_results[0] = new_temp;
6077 }
6078
6079 /* Record this operation if it could be reused by the epilogue loop. */
6080 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6081 && vec_num == 1)
6082 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6083 { orig_reduc_input, reduc_info });
6084
6085 if (double_reduc)
6086 loop = outer_loop;
6087
6088 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6089 phis with new adjusted scalar results, i.e., replace use <s_out0>
6090 with use <s_out4>.
6091
6092 Transform:
6093 loop_exit:
6094 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6095 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6096 v_out2 = reduce <v_out1>
6097 s_out3 = extract_field <v_out2, 0>
6098 s_out4 = adjust_result <s_out3>
6099 use <s_out0>
6100 use <s_out0>
6101
6102 into:
6103
6104 loop_exit:
6105 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6106 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6107 v_out2 = reduce <v_out1>
6108 s_out3 = extract_field <v_out2, 0>
6109 s_out4 = adjust_result <s_out3>
6110 use <s_out4>
6111 use <s_out4> */
6112
6113 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6114 for (k = 0; k < live_out_stmts.size (); k++)
6115 {
6116 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6117 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6118
6119 phis.create (3);
6120 /* Find the loop-closed-use at the loop exit of the original scalar
6121 result. (The reduction result is expected to have two immediate uses,
6122 one at the latch block, and one at the loop exit). For double
6123 reductions we are looking for exit phis of the outer loop. */
6124 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6125 {
6126 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6127 {
6128 if (!is_gimple_debug (USE_STMT (use_p)))
6129 phis.safe_push (USE_STMT (use_p));
6130 }
6131 else
6132 {
6133 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6134 {
6135 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6136
6137 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6138 {
6139 if (!flow_bb_inside_loop_p (loop,
6140 gimple_bb (USE_STMT (phi_use_p)))
6141 && !is_gimple_debug (USE_STMT (phi_use_p)))
6142 phis.safe_push (USE_STMT (phi_use_p));
6143 }
6144 }
6145 }
6146 }
6147
6148 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6149 {
6150 /* Replace the uses: */
6151 orig_name = PHI_RESULT (exit_phi);
6152
6153 /* Look for a single use at the target of the skip edge. */
6154 if (unify_with_main_loop_p)
6155 {
6156 use_operand_p use_p;
6157 gimple *user;
6158 if (!single_imm_use (orig_name, &use_p, &user))
6159 gcc_unreachable ();
6160 orig_name = gimple_get_lhs (user);
6161 }
6162
6163 scalar_result = scalar_results[k];
6164 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6165 {
6166 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6167 SET_USE (use_p, scalar_result);
6168 update_stmt (use_stmt);
6169 }
6170 }
6171
6172 phis.release ();
6173 }
6174 }
6175
6176 /* Return a vector of type VECTYPE that is equal to the vector select
6177 operation "MASK ? VEC : IDENTITY". Insert the select statements
6178 before GSI. */
6179
6180 static tree
6181 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6182 tree vec, tree identity)
6183 {
6184 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6185 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6186 mask, vec, identity);
6187 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6188 return cond;
6189 }
6190
6191 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6192 order, starting with LHS. Insert the extraction statements before GSI and
6193 associate the new scalar SSA names with variable SCALAR_DEST.
6194 Return the SSA name for the result. */
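/* Illustrative example (element count chosen for exposition): for a
   four-element VECTOR_RHS {x0, x1, x2, x3}, CODE PLUS_EXPR and incoming
   LHS s, the emitted scalar code computes

     (((s + x0) + x1) + x2) + x3

   preserving the left-to-right evaluation order of the original loop.  */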
6195
6196 static tree
6197 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6198 tree_code code, tree lhs, tree vector_rhs)
6199 {
6200 tree vectype = TREE_TYPE (vector_rhs);
6201 tree scalar_type = TREE_TYPE (vectype);
6202 tree bitsize = TYPE_SIZE (scalar_type);
6203 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6204 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6205
6206 for (unsigned HOST_WIDE_INT bit_offset = 0;
6207 bit_offset < vec_size_in_bits;
6208 bit_offset += element_bitsize)
6209 {
6210 tree bitpos = bitsize_int (bit_offset);
6211 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6212 bitsize, bitpos);
6213
6214 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6215 rhs = make_ssa_name (scalar_dest, stmt);
6216 gimple_assign_set_lhs (stmt, rhs);
6217 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6218
6219 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6220 tree new_name = make_ssa_name (scalar_dest, stmt);
6221 gimple_assign_set_lhs (stmt, new_name);
6222 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6223 lhs = new_name;
6224 }
6225 return lhs;
6226 }
6227
6228 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6229 type of the vector input. */
6230
6231 static internal_fn
6232 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6233 {
6234 internal_fn mask_reduc_fn;
6235
6236 switch (reduc_fn)
6237 {
6238 case IFN_FOLD_LEFT_PLUS:
6239 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6240 break;
6241
6242 default:
6243 return IFN_LAST;
6244 }
6245
6246 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6247 OPTIMIZE_FOR_SPEED))
6248 return mask_reduc_fn;
6249 return IFN_LAST;
6250 }
6251
6252 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6253 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6254 statement. CODE is the operation performed by STMT_INFO and OPS are
6255 its scalar operands. REDUC_INDEX is the index of the operand in
6256 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6257 implements in-order reduction, or IFN_LAST if we should open-code it.
6258 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6259 that should be used to control the operation in a fully-masked loop. */
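/* Illustrative example (source fragment invented for exposition): an
   in-order reduction such as

     float s = init;
     for (int i = 0; i < n; ++i)
       s += a[i];

   compiled without reassociation (e.g. without -ffast-math) must keep the
   additions in their original order, so each vector of loaded elements is
   folded into the scalar accumulator with IFN_FOLD_LEFT_PLUS (or with the
   open-coded expansion above) rather than being reduced pairwise.  */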
6260
6261 static bool
6262 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6263 stmt_vec_info stmt_info,
6264 gimple_stmt_iterator *gsi,
6265 gimple **vec_stmt, slp_tree slp_node,
6266 gimple *reduc_def_stmt,
6267 tree_code code, internal_fn reduc_fn,
6268 tree ops[3], tree vectype_in,
6269 int reduc_index, vec_loop_masks *masks)
6270 {
6271 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6272 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6273 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6274
6275 int ncopies;
6276 if (slp_node)
6277 ncopies = 1;
6278 else
6279 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6280
6281 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6282 gcc_assert (ncopies == 1);
6283 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6284
6285 if (slp_node)
6286 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6287 TYPE_VECTOR_SUBPARTS (vectype_in)));
6288
6289 tree op0 = ops[1 - reduc_index];
6290
6291 int group_size = 1;
6292 stmt_vec_info scalar_dest_def_info;
6293 auto_vec<tree> vec_oprnds0;
6294 if (slp_node)
6295 {
6296 auto_vec<vec<tree> > vec_defs (2);
6297 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6298 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6299 vec_defs[0].release ();
6300 vec_defs[1].release ();
6301 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6302 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6303 }
6304 else
6305 {
6306 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6307 op0, &vec_oprnds0);
6308 scalar_dest_def_info = stmt_info;
6309 }
6310
6311 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6312 tree scalar_type = TREE_TYPE (scalar_dest);
6313 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6314
6315 int vec_num = vec_oprnds0.length ();
6316 gcc_assert (vec_num == 1 || slp_node);
6317 tree vec_elem_type = TREE_TYPE (vectype_out);
6318 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6319
6320 tree vector_identity = NULL_TREE;
6321 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6322 vector_identity = build_zero_cst (vectype_out);
6323
6324 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6325 int i;
6326 tree def0;
6327 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6328 {
6329 gimple *new_stmt;
6330 tree mask = NULL_TREE;
6331 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6332 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6333
6334 /* Handle MINUS by adding the negative. */
6335 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6336 {
6337 tree negated = make_ssa_name (vectype_out);
6338 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6339 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6340 def0 = negated;
6341 }
6342
6343 if (mask && mask_reduc_fn == IFN_LAST)
6344 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6345 vector_identity);
6346
6347 /* On the first iteration the input is simply the scalar phi
6348 result, and for subsequent iterations it is the output of
6349 the preceding operation. */
6350 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6351 {
6352 if (mask && mask_reduc_fn != IFN_LAST)
6353 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6354 def0, mask);
6355 else
6356 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6357 def0);
6358 /* For chained SLP reductions the output of the previous reduction
6359 operation serves as the input of the next. For the final statement
6360 the output cannot be a temporary - we reuse the original
6361 scalar destination of the last statement. */
6362 if (i != vec_num - 1)
6363 {
6364 gimple_set_lhs (new_stmt, scalar_dest_var);
6365 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6366 gimple_set_lhs (new_stmt, reduc_var);
6367 }
6368 }
6369 else
6370 {
6371 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6372 reduc_var, def0);
6373 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6374 /* Remove the statement, so that we can use the same code paths
6375 as for statements that we've just created. */
6376 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6377 gsi_remove (&tmp_gsi, true);
6378 }
6379
6380 if (i == vec_num - 1)
6381 {
6382 gimple_set_lhs (new_stmt, scalar_dest);
6383 vect_finish_replace_stmt (loop_vinfo,
6384 scalar_dest_def_info,
6385 new_stmt);
6386 }
6387 else
6388 vect_finish_stmt_generation (loop_vinfo,
6389 scalar_dest_def_info,
6390 new_stmt, gsi);
6391
6392 if (slp_node)
6393 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6394 else
6395 {
6396 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6397 *vec_stmt = new_stmt;
6398 }
6399 }
6400
6401 return true;
6402 }
6403
6404 /* Function is_nonwrapping_integer_induction.
6405
6406 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6407 does not cause overflow. */
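/* Concretely (a sketch of the check below): with constant BASE and STEP
   and at most NI executions of the statement, the largest value reached
   is about BASE + STEP * NI; the induction is considered non-wrapping if
   that value (and the intermediate product) fits in the precision of the
   PHI result type, or if overflow is undefined for that type anyway.  */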
6408
6409 static bool
6410 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6411 {
6412 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6413 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6414 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6415 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6416 widest_int ni, max_loop_value, lhs_max;
6417 wi::overflow_type overflow = wi::OVF_NONE;
6418
6419 /* Make sure the loop is integer based. */
6420 if (TREE_CODE (base) != INTEGER_CST
6421 || TREE_CODE (step) != INTEGER_CST)
6422 return false;
6423
6424 /* Check that the maximum value reached by the induction will not wrap. */
6425
6426 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6427 return true;
6428
6429 if (! max_stmt_executions (loop, &ni))
6430 return false;
6431
6432 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6433 &overflow);
6434 if (overflow)
6435 return false;
6436
6437 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6438 TYPE_SIGN (lhs_type), &overflow);
6439 if (overflow)
6440 return false;
6441
6442 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6443 <= TYPE_PRECISION (lhs_type));
6444 }
6445
6446 /* Check if masking can be supported by inserting a conditional expression.
6447 CODE is the code for the operation. COND_FN is the conditional internal
6448 function, if it exists. VECTYPE_IN is the type of the vector input. */
6449 static bool
6450 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6451 tree vectype_in)
6452 {
6453 if (cond_fn != IFN_LAST
6454 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6455 OPTIMIZE_FOR_SPEED))
6456 return false;
6457
6458 if (code.is_tree_code ())
6459 switch (tree_code (code))
6460 {
6461 case DOT_PROD_EXPR:
6462 case SAD_EXPR:
6463 return true;
6464
6465 default:
6466 break;
6467 }
6468 return false;
6469 }
6470
6471 /* Insert a conditional expression to enable masked vectorization. CODE is the
6472 code for the operation. VOP is the array of operands. MASK is the loop
6473 mask. GSI is a statement iterator used to place the new conditional
6474 expression. */
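/* For example (a sketch of the statements built below): for
   DOT_PROD_EXPR the second operand is zeroed in the inactive lanes,

     masked_op1 = mask ? vop[1] : 0;

   so those lanes contribute nothing to the accumulator, while for
   SAD_EXPR the second operand is replaced by the first,

     masked_op1 = mask ? vop[1] : vop[0];

   making the absolute difference zero in the inactive lanes.  */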
6475 static void
6476 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6477 gimple_stmt_iterator *gsi)
6478 {
6479 switch (tree_code (code))
6480 {
6481 case DOT_PROD_EXPR:
6482 {
6483 tree vectype = TREE_TYPE (vop[1]);
6484 tree zero = build_zero_cst (vectype);
6485 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6486 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6487 mask, vop[1], zero);
6488 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6489 vop[1] = masked_op1;
6490 break;
6491 }
6492
6493 case SAD_EXPR:
6494 {
6495 tree vectype = TREE_TYPE (vop[1]);
6496 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6497 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6498 mask, vop[1], vop[0]);
6499 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6500 vop[1] = masked_op1;
6501 break;
6502 }
6503
6504 default:
6505 gcc_unreachable ();
6506 }
6507 }
6508
6509 /* Function vectorizable_reduction.
6510
6511 Check if STMT_INFO performs a reduction operation that can be vectorized.
6512 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6513 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6514 Return true if STMT_INFO is vectorizable in this way.
6515
6516 This function also handles reduction idioms (patterns) that have been
6517 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6518 may be of this form:
6519 X = pattern_expr (arg0, arg1, ..., X)
6520 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6521 sequence that had been detected and replaced by the pattern-stmt
6522 (STMT_INFO).
6523
6524 This function also handles reduction of condition expressions, for example:
6525 for (int i = 0; i < N; i++)
6526 if (a[i] < value)
6527 last = a[i];
6528 This is handled by vectorising the loop and creating an additional vector
6529 containing the loop indexes for which "a[i] < value" was true. In the
6530 function epilogue this is reduced to a single max value and then used to
6531 index into the vector of results.
6532
6533 In some cases of reduction patterns, the type of the reduction variable X is
6534 different than the type of the other arguments of STMT_INFO.
6535 In such cases, the vectype that is used when transforming STMT_INFO into
6536 a vector stmt is different than the vectype that is used to determine the
6537 vectorization factor, because it consists of a different number of elements
6538 than the actual number of elements that are being operated upon in parallel.
6539
6540 For example, consider an accumulation of shorts into an int accumulator.
6541 On some targets it's possible to vectorize this pattern operating on 8
6542 shorts at a time (hence, the vectype for purposes of determining the
6543 vectorization factor should be V8HI); on the other hand, the vectype that
6544 is used to create the vector form is actually V4SI (the type of the result).
6545
6546 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6547 indicates the actual level of parallelism (V8HI in the example), so
6548 that the right vectorization factor is derived. This vectype
6549 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6550 be used to create the vectorized stmt. The right vectype for the vectorized
6551 stmt is obtained from the type of the result X:
6552 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6553
6554 This means that, contrary to "regular" reductions (or "regular" stmts in
6555 general), the following equation:
6556 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6557 does *NOT* necessarily hold for reduction patterns. */
6558
6559 bool
6560 vectorizable_reduction (loop_vec_info loop_vinfo,
6561 stmt_vec_info stmt_info, slp_tree slp_node,
6562 slp_instance slp_node_instance,
6563 stmt_vector_for_cost *cost_vec)
6564 {
6565 tree vectype_in = NULL_TREE;
6566 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6567 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6568 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6569 stmt_vec_info cond_stmt_vinfo = NULL;
6570 int i;
6571 int ncopies;
6572 bool single_defuse_cycle = false;
6573 bool nested_cycle = false;
6574 bool double_reduc = false;
6575 int vec_num;
6576 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6577 tree cond_reduc_val = NULL_TREE;
6578
6579 /* Make sure it was already recognized as a reduction computation. */
6580 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6581 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6582 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6583 return false;
6584
6585 /* The stmt we store reduction analysis meta on. */
6586 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6587 reduc_info->is_reduc_info = true;
6588
6589 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6590 {
6591 if (is_a <gphi *> (stmt_info->stmt))
6592 {
6593 if (slp_node)
6594 {
6595 /* We eventually need to set a vector type on invariant
6596 arguments. */
6597 unsigned j;
6598 slp_tree child;
6599 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6600 if (!vect_maybe_update_slp_op_vectype
6601 (child, SLP_TREE_VECTYPE (slp_node)))
6602 {
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605 "incompatible vector types for "
6606 "invariants\n");
6607 return false;
6608 }
6609 }
6610 /* Analysis for double-reduction is done on the outer
6611 loop PHI; nested cycles have no further restrictions. */
6612 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6613 }
6614 else
6615 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6616 return true;
6617 }
6618
6619 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6620 stmt_vec_info phi_info = stmt_info;
6621 if (!is_a <gphi *> (stmt_info->stmt))
6622 {
6623 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6624 return true;
6625 }
6626 if (slp_node)
6627 {
6628 slp_node_instance->reduc_phis = slp_node;
6629 /* ??? We're leaving slp_node to point to the PHIs, we only
6630 need it to get at the number of vector stmts which wasn't
6631 yet initialized for the instance root. */
6632 }
6633 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6634 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6635 else
6636 {
6637 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6638 == vect_double_reduction_def);
6639 use_operand_p use_p;
6640 gimple *use_stmt;
6641 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6642 &use_p, &use_stmt);
6643 gcc_assert (res);
6644 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6645 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6646 }
6647
6648 /* PHIs should not participate in patterns. */
6649 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6650 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6651
6652 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6653 and compute the reduction chain length. Discover the real
6654 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6655 tree reduc_def
6656 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6657 loop_latch_edge
6658 (gimple_bb (reduc_def_phi)->loop_father));
6659 unsigned reduc_chain_length = 0;
6660 bool only_slp_reduc_chain = true;
6661 stmt_info = NULL;
6662 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6663 while (reduc_def != PHI_RESULT (reduc_def_phi))
6664 {
6665 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6666 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6667 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6668 {
6669 if (dump_enabled_p ())
6670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6671 "reduction chain broken by patterns.\n");
6672 return false;
6673 }
6674 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6675 only_slp_reduc_chain = false;
6676 /* For epilogue generation live members of the chain need
6677 to point back to the PHI via their original stmt for
6678 info_for_reduction to work. For SLP we need to look at
6679 all lanes here - even though we only will vectorize from
6680 the SLP node with live lane zero the other live lanes also
6681 need to be identified as part of a reduction to be able
6682 to skip code generation for them. */
6683 if (slp_for_stmt_info)
6684 {
6685 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6686 if (STMT_VINFO_LIVE_P (s))
6687 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6688 }
6689 else if (STMT_VINFO_LIVE_P (vdef))
6690 STMT_VINFO_REDUC_DEF (def) = phi_info;
6691 gimple_match_op op;
6692 if (!gimple_extract_op (vdef->stmt, &op))
6693 {
6694 if (dump_enabled_p ())
6695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696 "reduction chain includes unsupported"
6697 " statement type.\n");
6698 return false;
6699 }
6700 if (CONVERT_EXPR_CODE_P (op.code))
6701 {
6702 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6703 {
6704 if (dump_enabled_p ())
6705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 "conversion in the reduction chain.\n");
6707 return false;
6708 }
6709 }
6710 else if (!stmt_info)
6711 /* First non-conversion stmt. */
6712 stmt_info = vdef;
6713 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6714 reduc_chain_length++;
6715 if (!stmt_info && slp_node)
6716 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6717 }
6718 /* PHIs should not participate in patterns. */
6719 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6720
6721 if (nested_in_vect_loop_p (loop, stmt_info))
6722 {
6723 loop = loop->inner;
6724 nested_cycle = true;
6725 }
6726
6727 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6728 element. */
6729 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6730 {
6731 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6732 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6733 }
6734 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6735 gcc_assert (slp_node
6736 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6737
6738 /* 1. Is vectorizable reduction? */
6739 /* Not supportable if the reduction variable is used in the loop, unless
6740 it's a reduction chain. */
6741 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6742 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6743 return false;
6744
6745 /* Reductions that are not used even in an enclosing outer-loop
6746 are expected to be "live" (used out of the loop). */
6747 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6748 && !STMT_VINFO_LIVE_P (stmt_info))
6749 return false;
6750
6751 /* 2. Has this been recognized as a reduction pattern?
6752
6753 Check if STMT represents a pattern that has been recognized
6754 in earlier analysis stages. For stmts that represent a pattern,
6755 the STMT_VINFO_RELATED_STMT field records the last stmt in
6756 the original sequence that constitutes the pattern. */
6757
6758 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6759 if (orig_stmt_info)
6760 {
6761 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6762 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6763 }
6764
6765 /* 3. Check the operands of the operation. The first operands are defined
6766 inside the loop body. The last operand is the reduction variable,
6767 which is defined by the loop-header-phi. */
6768
6769 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6770 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6771 gimple_match_op op;
6772 if (!gimple_extract_op (stmt_info->stmt, &op))
6773 gcc_unreachable ();
6774 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6775 || op.code == WIDEN_SUM_EXPR
6776 || op.code == SAD_EXPR);
6777 enum optab_subtype optab_query_kind = optab_vector;
6778 if (op.code == DOT_PROD_EXPR
6779 && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6780 != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6781 optab_query_kind = optab_vector_mixed_sign;
6782
6783 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6784 && !SCALAR_FLOAT_TYPE_P (op.type))
6785 return false;
6786
6787 /* Do not try to vectorize bit-precision reductions. */
6788 if (!type_has_mode_precision_p (op.type))
6789 return false;
6790
6791 /* For lane-reducing ops we're reducing the number of reduction PHIs
6792 which means the only use of that may be in the lane-reducing operation. */
6793 if (lane_reduc_code_p
6794 && reduc_chain_length != 1
6795 && !only_slp_reduc_chain)
6796 {
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "lane-reducing reduction with extra stmts.\n");
6800 return false;
6801 }
6802
6803 /* All uses but the last are expected to be defined in the loop.
6804 The last use is the reduction variable. In case of nested cycle this
6805 assumption is not true: we use reduc_index to record the index of the
6806 reduction variable. */
6807 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6808 /* We need to skip an extra operand for COND_EXPRs with embedded
6809 comparison. */
6810 unsigned opno_adjust = 0;
6811 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6812 opno_adjust = 1;
6813 for (i = 0; i < (int) op.num_ops; i++)
6814 {
6815 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6816 if (i == 0 && op.code == COND_EXPR)
6817 continue;
6818
6819 stmt_vec_info def_stmt_info;
6820 enum vect_def_type dt;
6821 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6822 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6823 &vectype_op[i], &def_stmt_info))
6824 {
6825 if (dump_enabled_p ())
6826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6827 "use not simple.\n");
6828 return false;
6829 }
6830 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6831 continue;
6832
6833 /* There should be only one cycle def in the stmt, the one
6834 leading to reduc_def. */
6835 if (VECTORIZABLE_CYCLE_DEF (dt))
6836 return false;
6837
6838 if (!vectype_op[i])
6839 vectype_op[i]
6840 = get_vectype_for_scalar_type (loop_vinfo,
6841 TREE_TYPE (op.ops[i]), slp_op[i]);
6842
6843 /* To properly compute ncopies we are interested in the widest
6844 non-reduction input type in case we're looking at a widening
6845 accumulation that we later handle in vect_transform_reduction. */
6846 if (lane_reduc_code_p
6847 && vectype_op[i]
6848 && (!vectype_in
6849 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6850 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
6851 vectype_in = vectype_op[i];
6852
6853 /* Record how the non-reduction-def value of COND_EXPR is defined.
6854 ??? For a chain of multiple CONDs we'd have to match them all up. */
6855 if (op.code == COND_EXPR && reduc_chain_length == 1)
6856 {
6857 if (dt == vect_constant_def)
6858 {
6859 cond_reduc_dt = dt;
6860 cond_reduc_val = op.ops[i];
6861 }
6862 else if (dt == vect_induction_def
6863 && def_stmt_info
6864 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6865 {
6866 cond_reduc_dt = dt;
6867 cond_stmt_vinfo = def_stmt_info;
6868 }
6869 }
6870 }
6871 if (!vectype_in)
6872 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6873 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6874
6875 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6876 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6877 /* If we have a condition reduction, see if we can simplify it further. */
6878 if (v_reduc_type == COND_REDUCTION)
6879 {
6880 if (slp_node)
6881 return false;
6882
6883 /* If the condition itself uses the reduction value, fail. */
6884 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6885 {
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888 "condition depends on previous iteration\n");
6889 return false;
6890 }
6891
6892 if (reduc_chain_length == 1
6893 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6894 vectype_in, OPTIMIZE_FOR_SPEED))
6895 {
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 "optimizing condition reduction with"
6899 " FOLD_EXTRACT_LAST.\n");
6900 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6901 }
6902 else if (cond_reduc_dt == vect_induction_def)
6903 {
6904 tree base
6905 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6906 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6907
6908 gcc_assert (TREE_CODE (base) == INTEGER_CST
6909 && TREE_CODE (step) == INTEGER_CST);
6910 cond_reduc_val = NULL_TREE;
6911 enum tree_code cond_reduc_op_code = ERROR_MARK;
6912 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6913 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6914 ;
6915 /* Find a suitable value: below base for MAX_EXPR, above base for
6916 MIN_EXPR; punt for now if base is the minimum value of the type for
6917 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6918 else if (tree_int_cst_sgn (step) == -1)
6919 {
6920 cond_reduc_op_code = MIN_EXPR;
6921 if (tree_int_cst_sgn (base) == -1)
6922 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6923 else if (tree_int_cst_lt (base,
6924 TYPE_MAX_VALUE (TREE_TYPE (base))))
6925 cond_reduc_val
6926 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6927 }
6928 else
6929 {
6930 cond_reduc_op_code = MAX_EXPR;
6931 if (tree_int_cst_sgn (base) == 1)
6932 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6933 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6934 base))
6935 cond_reduc_val
6936 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6937 }
6938 if (cond_reduc_val)
6939 {
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_NOTE, vect_location,
6942 "condition expression based on "
6943 "integer induction.\n");
6944 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6945 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6946 = cond_reduc_val;
6947 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6948 }
6949 }
6950 else if (cond_reduc_dt == vect_constant_def)
6951 {
6952 enum vect_def_type cond_initial_dt;
6953 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6954 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6955 if (cond_initial_dt == vect_constant_def
6956 && types_compatible_p (TREE_TYPE (cond_initial_val),
6957 TREE_TYPE (cond_reduc_val)))
6958 {
6959 tree e = fold_binary (LE_EXPR, boolean_type_node,
6960 cond_initial_val, cond_reduc_val);
6961 if (e && (integer_onep (e) || integer_zerop (e)))
6962 {
6963 if (dump_enabled_p ())
6964 dump_printf_loc (MSG_NOTE, vect_location,
6965 "condition expression based on "
6966 "compile time constant.\n");
6967 /* Record reduction code at analysis stage. */
6968 STMT_VINFO_REDUC_CODE (reduc_info)
6969 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6970 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6971 }
6972 }
6973 }
6974 }
6975
6976 if (STMT_VINFO_LIVE_P (phi_info))
6977 return false;
6978
6979 if (slp_node)
6980 ncopies = 1;
6981 else
6982 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6983
6984 gcc_assert (ncopies >= 1);
6985
6986 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6987
6988 if (nested_cycle)
6989 {
6990 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6991 == vect_double_reduction_def);
6992 double_reduc = true;
6993 }
6994
6995 /* 4.2. Check support for the epilog operation.
6996
6997 If STMT represents a reduction pattern, then the type of the
6998 reduction variable may be different than the type of the rest
6999 of the arguments. For example, consider the case of accumulation
7000 of shorts into an int accumulator; the original code:
7001 S1: int_a = (int) short_a;
7002 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7003
7004 was replaced with:
7005 STMT: int_acc = widen_sum <short_a, int_acc>
7006
7007 This means that:
7008 1. The tree-code that is used to create the vector operation in the
7009 epilog code (that reduces the partial results) is not the
7010 tree-code of STMT, but is rather the tree-code of the original
7011 stmt from the pattern that STMT is replacing. I.e, in the example
7012 above we want to use 'widen_sum' in the loop, but 'plus' in the
7013 epilog.
7014 2. The type (mode) we use to check available target support
7015 for the vector operation to be created in the *epilog*, is
7016 determined by the type of the reduction variable (in the example
7017 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7018 However the type (mode) we use to check available target support
7019 for the vector operation to be created *inside the loop*, is
7020 determined by the type of the other arguments to STMT (in the
7021 example we'd check this: optab_handler (widen_sum_optab,
7022 vect_short_mode)).
7023
7024 This is contrary to "regular" reductions, in which the types of all
7025 the arguments are the same as the type of the reduction variable.
7026 For "regular" reductions we can therefore use the same vector type
7027 (and also the same tree-code) when generating the epilog code and
7028 when generating the code inside the loop. */
7029
7030 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7031 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7032
7033 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7034 if (reduction_type == TREE_CODE_REDUCTION)
7035 {
7036 /* Check whether it's ok to change the order of the computation.
7037 Generally, when vectorizing a reduction we change the order of the
7038 computation. This may change the behavior of the program in some
7039 cases, so we need to check that this is ok. One exception is when
7040 vectorizing an outer-loop: the inner-loop is executed sequentially,
7041 and therefore vectorizing reductions in the inner-loop during
7042 outer-loop vectorization is safe. Likewise when we are vectorizing
7043 a series of reductions using SLP and the VF is one, the reductions
7044 are performed in scalar order. */
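/* For example (illustration only): with VF == 4 a scalar sum

     s = a[0] + a[1] + a[2] + ... + a[n-1]

   is computed as four lane-wise partial sums that are only combined in
   the epilogue, e.g.

     s = (a[0] + a[4] + ...) + (a[1] + a[5] + ...) + ...

   which is a different association of the same operations.  */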
7045 if (slp_node
7046 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7047 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7048 ;
7049 else if (needs_fold_left_reduction_p (op.type, orig_code))
7050 {
7051 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7052 is not directly used in stmt. */
7053 if (!only_slp_reduc_chain
7054 && reduc_chain_length != 1)
7055 {
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7058 "in-order reduction chain without SLP.\n");
7059 return false;
7060 }
7061 STMT_VINFO_REDUC_TYPE (reduc_info)
7062 = reduction_type = FOLD_LEFT_REDUCTION;
7063 }
7064 else if (!commutative_binary_op_p (orig_code, op.type)
7065 || !associative_binary_op_p (orig_code, op.type))
7066 {
7067 if (dump_enabled_p ())
7068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069 "reduction: not commutative/associative\n");
7070 return false;
7071 }
7072 }
7073
7074 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7075 && ncopies > 1)
7076 {
7077 if (dump_enabled_p ())
7078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079 "multiple types in double reduction or condition "
7080 "reduction or fold-left reduction.\n");
7081 return false;
7082 }
7083
7084 internal_fn reduc_fn = IFN_LAST;
7085 if (reduction_type == TREE_CODE_REDUCTION
7086 || reduction_type == FOLD_LEFT_REDUCTION
7087 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7088 || reduction_type == CONST_COND_REDUCTION)
7089 {
7090 if (reduction_type == FOLD_LEFT_REDUCTION
7091 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7092 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7093 {
7094 if (reduc_fn != IFN_LAST
7095 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7096 OPTIMIZE_FOR_SPEED))
7097 {
7098 if (dump_enabled_p ())
7099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7100 "reduc op not supported by target.\n");
7101
7102 reduc_fn = IFN_LAST;
7103 }
7104 }
7105 else
7106 {
7107 if (!nested_cycle || double_reduc)
7108 {
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 "no reduc code for scalar code.\n");
7112
7113 return false;
7114 }
7115 }
7116 }
7117 else if (reduction_type == COND_REDUCTION)
7118 {
7119 int scalar_precision
7120 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7121 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7122 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7123 vectype_out);
7124
7125 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7126 OPTIMIZE_FOR_SPEED))
7127 reduc_fn = IFN_REDUC_MAX;
7128 }
7129 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7130
7131 if (reduction_type != EXTRACT_LAST_REDUCTION
7132 && (!nested_cycle || double_reduc)
7133 && reduc_fn == IFN_LAST
7134 && !nunits_out.is_constant ())
7135 {
7136 if (dump_enabled_p ())
7137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7138 "missing target support for reduction on"
7139 " variable-length vectors.\n");
7140 return false;
7141 }
7142
7143 /* For SLP reductions, see if there is a neutral value we can use. */
7144 tree neutral_op = NULL_TREE;
7145 if (slp_node)
7146 {
7147 tree initial_value = NULL_TREE;
7148 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7149 initial_value = vect_phi_initial_value (reduc_def_phi);
7150 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7151 orig_code, initial_value);
7152 }
7153
7154 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7155 {
7156 /* We can't support in-order reductions of code such as this:
7157
7158 for (int i = 0; i < n1; ++i)
7159 for (int j = 0; j < n2; ++j)
7160 l += a[j];
7161
7162 since GCC effectively transforms the loop when vectorizing:
7163
7164 for (int i = 0; i < n1 / VF; ++i)
7165 for (int j = 0; j < n2; ++j)
7166 for (int k = 0; k < VF; ++k)
7167 l += a[j];
7168
7169 which is a reassociation of the original operation. */
7170 if (dump_enabled_p ())
7171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7172 "in-order double reduction not supported.\n");
7173
7174 return false;
7175 }
7176
7177 if (reduction_type == FOLD_LEFT_REDUCTION
7178 && slp_node
7179 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7180 {
7181 /* We cannot use in-order reductions in this case because there is
7182 an implicit reassociation of the operations involved. */
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7185 "in-order unchained SLP reductions not supported.\n");
7186 return false;
7187 }
7188
7189 /* For double reductions, and for SLP reductions with a neutral value,
7190 we construct a variable-length initial vector by loading a vector
7191 full of the neutral value and then shift-and-inserting the start
7192 values into the low-numbered elements. */
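/* For instance (a sketch): for a sum with start value s and neutral
   value 0 this builds

     init = { 0, 0, ..., 0 };
     init = .VEC_SHL_INSERT (init, s);

   giving { s, 0, ..., 0 } without needing to know the number of vector
   elements at compile time.  */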
7193 if ((double_reduc || neutral_op)
7194 && !nunits_out.is_constant ()
7195 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7196 vectype_out, OPTIMIZE_FOR_SPEED))
7197 {
7198 if (dump_enabled_p ())
7199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7200 "reduction on variable-length vectors requires"
7201 " target support for a vector-shift-and-insert"
7202 " operation.\n");
7203 return false;
7204 }
7205
7206 /* Check extra constraints for variable-length unchained SLP reductions. */
7207 if (slp_node
7208 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7209 && !nunits_out.is_constant ())
7210 {
7211 /* We checked above that we could build the initial vector when
7212 there's a neutral element value. Check here for the case in
7213 which each SLP statement has its own initial value and in which
7214 that value needs to be repeated for every instance of the
7215 statement within the initial vector. */
7216 unsigned int group_size = SLP_TREE_LANES (slp_node);
7217 if (!neutral_op
7218 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7219 TREE_TYPE (vectype_out)))
7220 {
7221 if (dump_enabled_p ())
7222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7223 "unsupported form of SLP reduction for"
7224 " variable-length vectors: cannot build"
7225 " initial vector.\n");
7226 return false;
7227 }
7228 /* The epilogue code relies on the number of elements being a multiple
7229 of the group size. The duplicate-and-interleave approach to setting
7230 up the initial vector does too. */
7231 if (!multiple_p (nunits_out, group_size))
7232 {
7233 if (dump_enabled_p ())
7234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7235 "unsupported form of SLP reduction for"
7236 " variable-length vectors: the vector size"
7237 " is not a multiple of the number of results.\n");
7238 return false;
7239 }
7240 }
7241
7242 if (reduction_type == COND_REDUCTION)
7243 {
7244 widest_int ni;
7245
7246 if (! max_loop_iterations (loop, &ni))
7247 {
7248 if (dump_enabled_p ())
7249 dump_printf_loc (MSG_NOTE, vect_location,
7250 "loop count not known, cannot create cond "
7251 "reduction.\n");
7252 return false;
7253 }
7254 /* Convert backedges to iterations. */
7255 ni += 1;
7256
7257 /* The additional index will be the same type as the condition. Check
7258 that the loop can fit into this less one (because we'll use up the
7259 zero slot for when there are no matches). */
7260 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7261 if (wi::geu_p (ni, wi::to_widest (max_index)))
7262 {
7263 if (dump_enabled_p ())
7264 dump_printf_loc (MSG_NOTE, vect_location,
7265 "loop size is greater than data size.\n");
7266 return false;
7267 }
7268 }
7269
7270 /* In case the vectorization factor (VF) is bigger than the number
7271 of elements that we can fit in a vectype (nunits), we have to generate
7272 more than one vector stmt, i.e., we need to "unroll" the
7273 vector stmt by a factor VF/nunits. For more details see documentation
7274 in vectorizable_operation. */
7275
7276 /* If the reduction is used in an outer loop we need to generate
7277 VF intermediate results, like so (e.g. for ncopies=2):
7278 r0 = phi (init, r0)
7279 r1 = phi (init, r1)
7280 r0 = x0 + r0;
7281 r1 = x1 + r1;
7282 (i.e. we generate VF results in 2 registers).
7283 In this case we have a separate def-use cycle for each copy, and therefore
7284 for each copy we get the vector def for the reduction variable from the
7285 respective phi node created for this copy.
7286
7287 Otherwise (the reduction is unused in the loop nest), we can combine
7288 together intermediate results, like so (e.g. for ncopies=2):
7289 r = phi (init, r)
7290 r = x0 + r;
7291 r = x1 + r;
7292 (i.e. we generate VF/2 results in a single register).
7293 In this case for each copy we get the vector def for the reduction variable
7294 from the vectorized reduction operation generated in the previous iteration.
7295
7296 This only works when we see both the reduction PHI and its only consumer
7297 in vectorizable_reduction and there are no intermediate stmts
7298 participating. When unrolling we want each unrolled iteration to have its
7299 own reduction accumulator since one of the main goals of unrolling a
7300 reduction is to reduce the aggregate loop-carried latency. */
7301 if (ncopies > 1
7302 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7303 && reduc_chain_length == 1
7304 && loop_vinfo->suggested_unroll_factor == 1)
7305 single_defuse_cycle = true;
7306
7307 if (single_defuse_cycle || lane_reduc_code_p)
7308 {
7309 gcc_assert (op.code != COND_EXPR);
7310
7311 /* 4. Supportable by target? */
7312 bool ok = true;
7313
7314 /* 4.1. check support for the operation in the loop */
7315 machine_mode vec_mode = TYPE_MODE (vectype_in);
7316 if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7317 {
7318 if (dump_enabled_p ())
7319 dump_printf (MSG_NOTE, "op not supported by target.\n");
7320 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7321 || !vect_can_vectorize_without_simd_p (op.code))
7322 ok = false;
7323 else
7324 if (dump_enabled_p ())
7325 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7326 }
7327
7328 if (vect_emulated_vector_p (vectype_in)
7329 && !vect_can_vectorize_without_simd_p (op.code))
7330 {
7331 if (dump_enabled_p ())
7332 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7333 return false;
7334 }
7335
7336 /* Lane-reducing operations have to go through vect_transform_reduction.
7337 For the other cases try without the single cycle optimization. */
7338 if (!ok)
7339 {
7340 if (lane_reduc_code_p)
7341 return false;
7342 else
7343 single_defuse_cycle = false;
7344 }
7345 }
7346 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7347
7348 /* If the reduction stmt is one of the patterns that have lane
7349 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7350 if ((ncopies > 1 && ! single_defuse_cycle)
7351 && lane_reduc_code_p)
7352 {
7353 if (dump_enabled_p ())
7354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7355 "multi def-use cycle not possible for lane-reducing "
7356 "reduction operation\n");
7357 return false;
7358 }
7359
7360 if (slp_node
7361 && !(!single_defuse_cycle
7362 && !lane_reduc_code_p
7363 && reduction_type != FOLD_LEFT_REDUCTION))
7364 for (i = 0; i < (int) op.num_ops; i++)
7365 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7366 {
7367 if (dump_enabled_p ())
7368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7369 "incompatible vector types for invariants\n");
7370 return false;
7371 }
7372
7373 if (slp_node)
7374 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7375 else
7376 vec_num = 1;
7377
7378 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7379 reduction_type, ncopies, cost_vec);
7380 /* Cost the reduction op inside the loop if transformed via
7381 vect_transform_reduction. Otherwise this is costed by the
7382 separate vectorizable_* routines. */
7383 if (single_defuse_cycle || lane_reduc_code_p)
7384 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7385
7386 if (dump_enabled_p ()
7387 && reduction_type == FOLD_LEFT_REDUCTION)
7388 dump_printf_loc (MSG_NOTE, vect_location,
7389 "using an in-order (fold-left) reduction.\n");
7390 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7391 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7392 reductions go through their own vectorizable_* routines. */
7393 if (!single_defuse_cycle
7394 && !lane_reduc_code_p
7395 && reduction_type != FOLD_LEFT_REDUCTION)
7396 {
7397 stmt_vec_info tem
7398 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7399 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7400 {
7401 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7402 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7403 }
7404 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7405 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7406 }
7407 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7408 {
7409 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7410 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7411
7412 if (reduction_type != FOLD_LEFT_REDUCTION
7413 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7414 && (cond_fn == IFN_LAST
7415 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7416 OPTIMIZE_FOR_SPEED)))
7417 {
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 "can't operate on partial vectors because"
7421 " no conditional operation is available.\n");
7422 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7423 }
7424 else if (reduction_type == FOLD_LEFT_REDUCTION
7425 && reduc_fn == IFN_LAST
7426 && !expand_vec_cond_expr_p (vectype_in,
7427 truth_type_for (vectype_in),
7428 SSA_NAME))
7429 {
7430 if (dump_enabled_p ())
7431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7432 "can't operate on partial vectors because"
7433 " no conditional operation is available.\n");
7434 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7435 }
7436 else
7437 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7438 vectype_in, NULL);
7439 }
7440 return true;
7441 }
7442
7443 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7444 value. */
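/* For example (a sketch, assuming a fully-masked loop and a target that
   provides a conditional add): the reduction statement of

     for (int i = 0; i < n; ++i)
       res += a[i];

   is transformed into something like

     vec_res_1 = .COND_ADD (loop_mask, vec_res_0, vec_a, vec_res_0);

   where lanes that are masked off simply keep the previous accumulator
   value.  */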
7445
7446 bool
7447 vect_transform_reduction (loop_vec_info loop_vinfo,
7448 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7449 gimple **vec_stmt, slp_tree slp_node)
7450 {
7451 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7452 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7453 int i;
7454 int ncopies;
7455 int vec_num;
7456
7457 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7458 gcc_assert (reduc_info->is_reduc_info);
7459
7460 if (nested_in_vect_loop_p (loop, stmt_info))
7461 {
7462 loop = loop->inner;
7463 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7464 }
7465
7466 gimple_match_op op;
7467 if (!gimple_extract_op (stmt_info->stmt, &op))
7468 gcc_unreachable ();
7469
7470 /* All uses but the last are expected to be defined in the loop.
7471 The last use is the reduction variable. In case of nested cycle this
7472 assumption is not true: we use reduc_index to record the index of the
7473 reduction variable. */
7474 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7475 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7476 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7477 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7478
7479 if (slp_node)
7480 {
7481 ncopies = 1;
7482 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7483 }
7484 else
7485 {
7486 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7487 vec_num = 1;
7488 }
7489
7490 code_helper code = canonicalize_code (op.code, op.type);
7491 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7492 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7493 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7494
7495 /* Transform. */
7496 tree new_temp = NULL_TREE;
7497 auto_vec<tree> vec_oprnds0;
7498 auto_vec<tree> vec_oprnds1;
7499 auto_vec<tree> vec_oprnds2;
7500 tree def0;
7501
7502 if (dump_enabled_p ())
7503 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7504
7505 /* FORNOW: Multiple types are not supported for condition. */
7506 if (code == COND_EXPR)
7507 gcc_assert (ncopies == 1);
7508
7509 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7510
7511 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7512 if (reduction_type == FOLD_LEFT_REDUCTION)
7513 {
7514 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7515 gcc_assert (code.is_tree_code ());
7516 return vectorize_fold_left_reduction
7517 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7518 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7519 }
7520
7521 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7522 gcc_assert (single_defuse_cycle
7523 || code == DOT_PROD_EXPR
7524 || code == WIDEN_SUM_EXPR
7525 || code == SAD_EXPR);
7526
7527 /* Create the destination vector */
7528 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7529 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7530
7531 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7532 single_defuse_cycle && reduc_index == 0
7533 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7534 single_defuse_cycle && reduc_index == 1
7535 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7536 op.num_ops == 3
7537 && !(single_defuse_cycle && reduc_index == 2)
7538 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7539 if (single_defuse_cycle)
7540 {
7541 gcc_assert (!slp_node);
7542 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7543 op.ops[reduc_index],
7544 reduc_index == 0 ? &vec_oprnds0
7545 : (reduc_index == 1 ? &vec_oprnds1
7546 : &vec_oprnds2));
7547 }
7548
7549 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7550 {
7551 gimple *new_stmt;
7552 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7553 if (masked_loop_p && !mask_by_cond_expr)
7554 {
7555 /* Make sure that the reduction accumulator is vop[0]. */
7556 if (reduc_index == 1)
7557 {
7558 gcc_assert (commutative_binary_op_p (code, op.type));
7559 std::swap (vop[0], vop[1]);
7560 }
7561 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7562 vectype_in, i);
7563 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7564 vop[0], vop[1], vop[0]);
7565 new_temp = make_ssa_name (vec_dest, call);
7566 gimple_call_set_lhs (call, new_temp);
7567 gimple_call_set_nothrow (call, true);
7568 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7569 new_stmt = call;
7570 }
7571 else
7572 {
7573 if (op.num_ops == 3)
7574 vop[2] = vec_oprnds2[i];
7575
7576 if (masked_loop_p && mask_by_cond_expr)
7577 {
7578 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7579 vectype_in, i);
7580 build_vect_cond_expr (code, vop, mask, gsi);
7581 }
7582
7583 if (code.is_internal_fn ())
7584 new_stmt = gimple_build_call_internal (internal_fn (code),
7585 op.num_ops,
7586 vop[0], vop[1], vop[2]);
7587 else
7588 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7589 vop[0], vop[1], vop[2]);
7590 new_temp = make_ssa_name (vec_dest, new_stmt);
7591 gimple_set_lhs (new_stmt, new_temp);
7592 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7593 }
7594
7595 if (slp_node)
7596 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7597 else if (single_defuse_cycle
7598 && i < ncopies - 1)
7599 {
7600 if (reduc_index == 0)
7601 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7602 else if (reduc_index == 1)
7603 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7604 else if (reduc_index == 2)
7605 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7606 }
7607 else
7608 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7609 }
7610
7611 if (!slp_node)
7612 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7613
7614 return true;
7615 }
7616
7617 /* Transform phase of a cycle PHI. */
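/* Roughly (a sketch with illustrative names): for a reduction PHI

     res_0 = PHI <init (preheader), res_1 (latch)>

   this creates the corresponding vector PHI

     vec_res_0 = PHI <vec_init (preheader), ... (latch)>

   where the latch argument is filled in later, during epilogue
   generation.  */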
7618
7619 bool
7620 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7621 stmt_vec_info stmt_info, gimple **vec_stmt,
7622 slp_tree slp_node, slp_instance slp_node_instance)
7623 {
7624 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7625 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7626 int i;
7627 int ncopies;
7628 int j;
7629 bool nested_cycle = false;
7630 int vec_num;
7631
7632 if (nested_in_vect_loop_p (loop, stmt_info))
7633 {
7634 loop = loop->inner;
7635 nested_cycle = true;
7636 }
7637
7638 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7639 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7640 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7641 gcc_assert (reduc_info->is_reduc_info);
7642
7643 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7644 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7645 /* Leave the scalar phi in place. */
7646 return true;
7647
7648 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7649 /* For a nested cycle we do not fill the above. */
7650 if (!vectype_in)
7651 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7652 gcc_assert (vectype_in);
7653
7654 if (slp_node)
7655 {
7656 /* The size vect_schedule_slp_instance computes is off for us. */
7657 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7658 * SLP_TREE_LANES (slp_node), vectype_in);
7659 ncopies = 1;
7660 }
7661 else
7662 {
7663 vec_num = 1;
7664 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7665 }
7666
7667 /* Check whether we should use a single PHI node and accumulate
7668 vectors to one before the backedge. */
7669 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7670 ncopies = 1;
7671
7672 /* Create the destination vector */
7673 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7674 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7675 vectype_out);
7676
7677 /* Get the loop-entry arguments. */
7678 tree vec_initial_def = NULL_TREE;
7679 auto_vec<tree> vec_initial_defs;
7680 if (slp_node)
7681 {
7682 vec_initial_defs.reserve (vec_num);
7683 if (nested_cycle)
7684 {
7685 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7686 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7687 &vec_initial_defs);
7688 }
7689 else
7690 {
7691 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7692 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7693 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7694
7695 unsigned int num_phis = stmts.length ();
7696 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7697 num_phis = 1;
7698 initial_values.reserve (num_phis);
7699 for (unsigned int i = 0; i < num_phis; ++i)
7700 {
7701 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7702 initial_values.quick_push (vect_phi_initial_value (this_phi));
7703 }
7704 if (vec_num == 1)
7705 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7706 if (!initial_values.is_empty ())
7707 {
7708 tree initial_value
7709 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7710 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7711 tree neutral_op
7712 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7713 code, initial_value);
7714 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7715 &vec_initial_defs, vec_num,
7716 stmts.length (), neutral_op);
7717 }
7718 }
7719 }
7720 else
7721 {
7722 /* Get at the scalar def before the loop, that defines the initial
7723 value of the reduction variable. */
7724 tree initial_def = vect_phi_initial_value (phi);
7725 reduc_info->reduc_initial_values.safe_push (initial_def);
7726 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7727 and we can't use zero for induc_val, use initial_def. Similarly
7728 for REDUC_MIN and initial_def larger than the base. */
7729 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7730 {
7731 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7732 if (TREE_CODE (initial_def) == INTEGER_CST
7733 && !integer_zerop (induc_val)
7734 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7735 && tree_int_cst_lt (initial_def, induc_val))
7736 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7737 && tree_int_cst_lt (induc_val, initial_def))))
7738 {
7739 induc_val = initial_def;
7740 /* Communicate to epilogue generation that we used the
7741 initial_def. */
7742 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7743 }
7744 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7745 }
7746 else if (nested_cycle)
7747 {
7748 /* Do not use an adjustment def as that case is not supported
7749 correctly if ncopies is not one. */
7750 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7751 ncopies, initial_def,
7752 &vec_initial_defs);
7753 }
7754 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7755 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7756 /* Fill the initial vector with the initial scalar value. */
7757 vec_initial_def
7758 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7759 initial_def, initial_def);
7760 else
7761 {
7762 if (ncopies == 1)
7763 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7764 if (!reduc_info->reduc_initial_values.is_empty ())
7765 {
7766 initial_def = reduc_info->reduc_initial_values[0];
7767 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7768 tree neutral_op
7769 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7770 code, initial_def);
7771 gcc_assert (neutral_op);
7772 /* Try to simplify the vector initialization by applying an
7773 adjustment after the reduction has been performed. */
7774 if (!reduc_info->reused_accumulator
7775 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7776 && !operand_equal_p (neutral_op, initial_def))
7777 {
7778 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7779 = initial_def;
7780 initial_def = neutral_op;
7781 }
7782 vec_initial_def
7783 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7784 initial_def, neutral_op);
7785 }
7786 }
7787 }
7788
7789 if (vec_initial_def)
7790 {
7791 vec_initial_defs.create (ncopies);
7792 for (i = 0; i < ncopies; ++i)
7793 vec_initial_defs.quick_push (vec_initial_def);
7794 }
7795
7796 if (auto *accumulator = reduc_info->reused_accumulator)
7797 {
7798 tree def = accumulator->reduc_input;
7799 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7800 {
7801 unsigned int nreduc;
7802 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7803 (TREE_TYPE (def)),
7804 TYPE_VECTOR_SUBPARTS (vectype_out),
7805 &nreduc);
7806 gcc_assert (res);
7807 gimple_seq stmts = NULL;
7808 /* Reduce the single vector to a smaller one. */
7809 if (nreduc != 1)
7810 {
7811 /* Perform the reduction in the appropriate type. */
7812 tree rvectype = vectype_out;
7813 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7814 TREE_TYPE (TREE_TYPE (def))))
7815 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7816 TYPE_VECTOR_SUBPARTS
7817 (vectype_out));
7818 def = vect_create_partial_epilog (def, rvectype,
7819 STMT_VINFO_REDUC_CODE
7820 (reduc_info),
7821 &stmts);
7822 }
7823 /* The epilogue loop might use a different vector mode, like
7824 VNx2DI vs. V2DI. */
7825 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7826 {
7827 tree reduc_type = build_vector_type_for_mode
7828 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7829 def = gimple_convert (&stmts, reduc_type, def);
7830 }
7831 /* Adjust the input so we pick up the partially reduced value
7832 for the skip edge in vect_create_epilog_for_reduction. */
7833 accumulator->reduc_input = def;
7834 /* And the reduction could be carried out using a different sign. */
7835 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7836 def = gimple_convert (&stmts, vectype_out, def);
7837 if (loop_vinfo->main_loop_edge)
7838 {
7839 	      /* While we'd like to insert on the edge, doing so would split
7840 		 blocks and disturb bookkeeping; we will also eventually
7841 		 need this on the skip edge.  Rely on sinking to
7842 		 fix up optimal placement and insert in the pred.  */
7843 gimple_stmt_iterator gsi
7844 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7845 /* Insert before a cond that eventually skips the
7846 epilogue. */
7847 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7848 gsi_prev (&gsi);
7849 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7850 }
7851 else
7852 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7853 stmts);
7854 }
7855 if (loop_vinfo->main_loop_edge)
7856 vec_initial_defs[0]
7857 = vect_get_main_loop_result (loop_vinfo, def,
7858 vec_initial_defs[0]);
7859 else
7860 vec_initial_defs.safe_push (def);
7861 }
7862
7863 /* Generate the reduction PHIs upfront. */
7864 for (i = 0; i < vec_num; i++)
7865 {
7866 tree vec_init_def = vec_initial_defs[i];
7867 for (j = 0; j < ncopies; j++)
7868 {
7869 /* Create the reduction-phi that defines the reduction
7870 operand. */
7871 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7872
7873 /* Set the loop-entry arg of the reduction-phi. */
7874 if (j != 0 && nested_cycle)
7875 vec_init_def = vec_initial_defs[j];
7876 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7877 UNKNOWN_LOCATION);
7878
7879 /* The loop-latch arg is set in epilogue processing. */
7880
7881 if (slp_node)
7882 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7883 else
7884 {
7885 if (j == 0)
7886 *vec_stmt = new_phi;
7887 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7888 }
7889 }
7890 }
7891
7892 return true;
7893 }
7894
7895 /* Vectorizes LC (loop-closed) PHIs: single-argument PHIs that carry a value defined inside a loop out to its exit block.  */
7896
7897 bool
7898 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7899 stmt_vec_info stmt_info, gimple **vec_stmt,
7900 slp_tree slp_node)
7901 {
7902 if (!loop_vinfo
7903 || !is_a <gphi *> (stmt_info->stmt)
7904 || gimple_phi_num_args (stmt_info->stmt) != 1)
7905 return false;
7906
7907 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7908 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7909 return false;
7910
7911 if (!vec_stmt) /* transformation not required. */
7912 {
7913       /* Deal with copies from externs or constants that are disguised as
7914 	 loop-closed PHI nodes (PR97886).  */
7915 if (slp_node
7916 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7917 SLP_TREE_VECTYPE (slp_node)))
7918 {
7919 if (dump_enabled_p ())
7920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7921 "incompatible vector types for invariants\n");
7922 return false;
7923 }
7924 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7925 return true;
7926 }
7927
7928 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7929 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7930 basic_block bb = gimple_bb (stmt_info->stmt);
7931 edge e = single_pred_edge (bb);
7932 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7933 auto_vec<tree> vec_oprnds;
7934 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7935 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7936 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7937 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7938 {
7939 /* Create the vectorized LC PHI node. */
7940 gphi *new_phi = create_phi_node (vec_dest, bb);
7941 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7942 if (slp_node)
7943 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7944 else
7945 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7946 }
7947 if (!slp_node)
7948 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7949
7950 return true;
7951 }
7952
7953 /* Vectorizes general (non-cycle) PHIs; these are handled only via SLP.  */
7954
7955 bool
7956 vectorizable_phi (vec_info *,
7957 stmt_vec_info stmt_info, gimple **vec_stmt,
7958 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7959 {
7960 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7961 return false;
7962
7963 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7964 return false;
7965
7966 tree vectype = SLP_TREE_VECTYPE (slp_node);
7967
7968 if (!vec_stmt) /* transformation not required. */
7969 {
7970 slp_tree child;
7971 unsigned i;
7972 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7973 if (!child)
7974 {
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "PHI node with unvectorized backedge def\n");
7978 return false;
7979 }
7980 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7981 {
7982 if (dump_enabled_p ())
7983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984 "incompatible vector types for invariants\n");
7985 return false;
7986 }
7987 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7988 && !useless_type_conversion_p (vectype,
7989 SLP_TREE_VECTYPE (child)))
7990 {
7991 /* With bools we can have mask and non-mask precision vectors
7992 	     or different non-mask precisions.  While pattern recog is
7993 	     supposed to guarantee consistency here, bugs in it can cause
7994 mismatches (PR103489 and PR103800 for example).
7995 Deal with them here instead of ICEing later. */
7996 if (dump_enabled_p ())
7997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7998 "incompatible vector type setup from "
7999 "bool pattern detection\n");
8000 return false;
8001 }
8002
8003 /* For single-argument PHIs assume coalescing which means zero cost
8004 for the scalar and the vector PHIs. This avoids artificially
8005 favoring the vector path (but may pessimize it in some cases). */
8006 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8007 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8008 vector_stmt, stmt_info, vectype, 0, vect_body);
8009 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8010 return true;
8011 }
8012
8013 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8014 basic_block bb = gimple_bb (stmt_info->stmt);
8015 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8016 auto_vec<gphi *> new_phis;
8017 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8018 {
8019 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8020
8021 /* Skip not yet vectorized defs. */
8022 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8023 && SLP_TREE_VEC_STMTS (child).is_empty ())
8024 continue;
8025
8026 auto_vec<tree> vec_oprnds;
8027 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8028 if (!new_phis.exists ())
8029 {
8030 new_phis.create (vec_oprnds.length ());
8031 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8032 {
8033 /* Create the vectorized LC PHI node. */
8034 new_phis.quick_push (create_phi_node (vec_dest, bb));
8035 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8036 }
8037 }
8038 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8039 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8040 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8041 }
8042 /* We should have at least one already vectorized child. */
8043 gcc_assert (new_phis.exists ());
8044
8045 return true;
8046 }
8047
8048 /* Return true if VECTYPE represents a vector that requires lowering
8049 by the vector lowering pass. */
8050
8051 bool
8052 vect_emulated_vector_p (tree vectype)
8053 {
8054 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8055 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8056 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8057 }
8058
8059 /* Return true if we can emulate CODE on an integer mode representation
8060 of a vector. */
8061
8062 bool
8063 vect_can_vectorize_without_simd_p (tree_code code)
8064 {
8065 switch (code)
8066 {
8067 case PLUS_EXPR:
8068 case MINUS_EXPR:
8069 case NEGATE_EXPR:
8070 case BIT_AND_EXPR:
8071 case BIT_IOR_EXPR:
8072 case BIT_XOR_EXPR:
8073 case BIT_NOT_EXPR:
8074 return true;
8075
8076 default:
8077 return false;
8078 }
8079 }
8080
8081 /* Likewise, but taking a code_helper. */
8082
8083 bool
8084 vect_can_vectorize_without_simd_p (code_helper code)
8085 {
8086 return (code.is_tree_code ()
8087 && vect_can_vectorize_without_simd_p (tree_code (code)));
8088 }
8089
8090 /* Function vectorizable_induction
8091
8092 Check if STMT_INFO performs an induction computation that can be vectorized.
8093 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8094 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8095 Return true if STMT_INFO is vectorizable in this way. */
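/* For example (an illustrative sketch only; the code below also handles SLP,
   nested inductions and peeling), an induction with initial value X and
   step S is realized for a 4-element vector as
     vec_init = [X, X+S, X+2*S, X+3*S]
     vec_step = [4*S, 4*S, 4*S, 4*S]
   with the in-loop update  vec_iv = vec_iv + vec_step.  */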
8096
8097 bool
8098 vectorizable_induction (loop_vec_info loop_vinfo,
8099 stmt_vec_info stmt_info,
8100 gimple **vec_stmt, slp_tree slp_node,
8101 stmt_vector_for_cost *cost_vec)
8102 {
8103 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8104 unsigned ncopies;
8105 bool nested_in_vect_loop = false;
8106 class loop *iv_loop;
8107 tree vec_def;
8108 edge pe = loop_preheader_edge (loop);
8109 basic_block new_bb;
8110 tree new_vec, vec_init, vec_step, t;
8111 tree new_name;
8112 gimple *new_stmt;
8113 gphi *induction_phi;
8114 tree induc_def, vec_dest;
8115 tree init_expr, step_expr;
8116 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8117 unsigned i;
8118 tree expr;
8119 gimple_stmt_iterator si;
8120
8121 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8122 if (!phi)
8123 return false;
8124
8125 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8126 return false;
8127
8128 /* Make sure it was recognized as induction computation. */
8129 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8130 return false;
8131
8132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8133 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8134
8135 if (slp_node)
8136 ncopies = 1;
8137 else
8138 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8139 gcc_assert (ncopies >= 1);
8140
8141 /* FORNOW. These restrictions should be relaxed. */
8142 if (nested_in_vect_loop_p (loop, stmt_info))
8143 {
8144 imm_use_iterator imm_iter;
8145 use_operand_p use_p;
8146 gimple *exit_phi;
8147 edge latch_e;
8148 tree loop_arg;
8149
8150 if (ncopies > 1)
8151 {
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8154 "multiple types in nested loop.\n");
8155 return false;
8156 }
8157
8158 exit_phi = NULL;
8159 latch_e = loop_latch_edge (loop->inner);
8160 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8161 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8162 {
8163 gimple *use_stmt = USE_STMT (use_p);
8164 if (is_gimple_debug (use_stmt))
8165 continue;
8166
8167 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8168 {
8169 exit_phi = use_stmt;
8170 break;
8171 }
8172 }
8173 if (exit_phi)
8174 {
8175 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8176 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8177 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8178 {
8179 if (dump_enabled_p ())
8180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 "inner-loop induction only used outside "
8182 "of the outer vectorized loop.\n");
8183 return false;
8184 }
8185 }
8186
8187 nested_in_vect_loop = true;
8188 iv_loop = loop->inner;
8189 }
8190 else
8191 iv_loop = loop;
8192 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8193
8194 if (slp_node && !nunits.is_constant ())
8195 {
8196 /* The current SLP code creates the step value element-by-element. */
8197 if (dump_enabled_p ())
8198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199 "SLP induction not supported for variable-length"
8200 " vectors.\n");
8201 return false;
8202 }
8203
8204 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8205 {
8206 if (dump_enabled_p ())
8207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8208 "floating point induction vectorization disabled\n");
8209 return false;
8210 }
8211
8212 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8213 gcc_assert (step_expr != NULL_TREE);
8214 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
8215 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
8216 {
8217 if (dump_enabled_p ())
8218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8219 "bit-precision induction vectorization not "
8220 "supported.\n");
8221 return false;
8222 }
8223 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8224
8225 /* Check for backend support of PLUS/MINUS_EXPR. */
8226 if (!directly_supported_p (PLUS_EXPR, step_vectype)
8227 || !directly_supported_p (MINUS_EXPR, step_vectype))
8228 return false;
8229
8230 if (!vec_stmt) /* transformation not required. */
8231 {
8232 unsigned inside_cost = 0, prologue_cost = 0;
8233 if (slp_node)
8234 {
8235 /* We eventually need to set a vector type on invariant
8236 arguments. */
8237 unsigned j;
8238 slp_tree child;
8239 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8240 if (!vect_maybe_update_slp_op_vectype
8241 (child, SLP_TREE_VECTYPE (slp_node)))
8242 {
8243 if (dump_enabled_p ())
8244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8245 "incompatible vector types for "
8246 "invariants\n");
8247 return false;
8248 }
8249 /* loop cost for vec_loop. */
8250 inside_cost
8251 = record_stmt_cost (cost_vec,
8252 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8253 vector_stmt, stmt_info, 0, vect_body);
8254 /* prologue cost for vec_init (if not nested) and step. */
8255 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8256 scalar_to_vec,
8257 stmt_info, 0, vect_prologue);
8258 }
8259 else /* if (!slp_node) */
8260 {
8261 /* loop cost for vec_loop. */
8262 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8263 stmt_info, 0, vect_body);
8264 /* prologue cost for vec_init and vec_step. */
8265 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8266 stmt_info, 0, vect_prologue);
8267 }
8268 if (dump_enabled_p ())
8269 dump_printf_loc (MSG_NOTE, vect_location,
8270 "vect_model_induction_cost: inside_cost = %d, "
8271 "prologue_cost = %d .\n", inside_cost,
8272 prologue_cost);
8273
8274 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8275 DUMP_VECT_SCOPE ("vectorizable_induction");
8276 return true;
8277 }
8278
8279 /* Transform. */
8280
8281 /* Compute a vector variable, initialized with the first VF values of
8282 the induction variable. E.g., for an iv with IV_PHI='X' and
8283 evolution S, for a vector of 4 units, we want to compute:
8284 [X, X + S, X + 2*S, X + 3*S]. */
8285
8286 if (dump_enabled_p ())
8287 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8288
8289 pe = loop_preheader_edge (iv_loop);
8290 /* Find the first insertion point in the BB. */
8291 basic_block bb = gimple_bb (phi);
8292 si = gsi_after_labels (bb);
8293
8294 /* For SLP induction we have to generate several IVs as for example
8295 with group size 3 we need
8296 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8297 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8298 if (slp_node)
8299 {
8300 /* Enforced above. */
8301 unsigned int const_nunits = nunits.to_constant ();
8302
8303 /* The initial values are vectorized, but any lanes > group_size
8304 need adjustment. */
8305 slp_tree init_node
8306 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8307
8308 /* Gather steps. Since we do not vectorize inductions as
8309 cycles we have to reconstruct the step from SCEV data. */
8310 unsigned group_size = SLP_TREE_LANES (slp_node);
8311 tree *steps = XALLOCAVEC (tree, group_size);
8312 tree *inits = XALLOCAVEC (tree, group_size);
8313 stmt_vec_info phi_info;
8314 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8315 {
8316 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8317 if (!init_node)
8318 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8319 pe->dest_idx);
8320 }
8321
8322 /* Now generate the IVs. */
8323 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8324 gcc_assert ((const_nunits * nvects) % group_size == 0);
8325 unsigned nivs;
8326 if (nested_in_vect_loop)
8327 nivs = nvects;
8328 else
8329 {
8330 /* Compute the number of distinct IVs we need. First reduce
8331 group_size if it is a multiple of const_nunits so we get
8332 one IV for a group_size of 4 but const_nunits 2. */
8333 unsigned group_sizep = group_size;
8334 if (group_sizep % const_nunits == 0)
8335 group_sizep = group_sizep / const_nunits;
8336 nivs = least_common_multiple (group_sizep,
8337 const_nunits) / const_nunits;
8338 }
8339 tree stept = TREE_TYPE (step_vectype);
8340 tree lupdate_mul = NULL_TREE;
8341 if (!nested_in_vect_loop)
8342 {
8343 /* The number of iterations covered in one vector iteration. */
8344 unsigned lup_mul = (nvects * const_nunits) / group_size;
8345 lupdate_mul
8346 = build_vector_from_val (step_vectype,
8347 SCALAR_FLOAT_TYPE_P (stept)
8348 ? build_real_from_wide (stept, lup_mul,
8349 UNSIGNED)
8350 : build_int_cstu (stept, lup_mul));
8351 }
8352 tree peel_mul = NULL_TREE;
8353 gimple_seq init_stmts = NULL;
8354 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8355 {
8356 if (SCALAR_FLOAT_TYPE_P (stept))
8357 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8358 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8359 else
8360 peel_mul = gimple_convert (&init_stmts, stept,
8361 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8362 peel_mul = gimple_build_vector_from_val (&init_stmts,
8363 step_vectype, peel_mul);
8364 }
8365 unsigned ivn;
8366 auto_vec<tree> vec_steps;
8367 for (ivn = 0; ivn < nivs; ++ivn)
8368 {
8369 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8370 tree_vector_builder init_elts (vectype, const_nunits, 1);
8371 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8372 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8373 {
8374 /* The scalar steps of the IVs. */
8375 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8376 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8377 step_elts.quick_push (elt);
8378 if (!init_node)
8379 {
8380 /* The scalar inits of the IVs if not vectorized. */
8381 elt = inits[(ivn*const_nunits + eltn) % group_size];
8382 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8383 TREE_TYPE (elt)))
8384 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8385 TREE_TYPE (vectype), elt);
8386 init_elts.quick_push (elt);
8387 }
8388 /* The number of steps to add to the initial values. */
8389 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8390 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8391 ? build_real_from_wide (stept,
8392 mul_elt, UNSIGNED)
8393 : build_int_cstu (stept, mul_elt));
8394 }
8395 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8396 vec_steps.safe_push (vec_step);
8397 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8398 if (peel_mul)
8399 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8400 step_mul, peel_mul);
8401 if (!init_node)
8402 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8403
8404 /* Create the induction-phi that defines the induction-operand. */
8405 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8406 "vec_iv_");
8407 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8408 induc_def = PHI_RESULT (induction_phi);
8409
8410 /* Create the iv update inside the loop */
8411 tree up = vec_step;
8412 if (lupdate_mul)
8413 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8414 vec_step, lupdate_mul);
8415 gimple_seq stmts = NULL;
8416 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8417 vec_def = gimple_build (&stmts,
8418 PLUS_EXPR, step_vectype, vec_def, up);
8419 vec_def = gimple_convert (&stmts, vectype, vec_def);
8420 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8421 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8422 UNKNOWN_LOCATION);
8423
8424 if (init_node)
8425 vec_init = vect_get_slp_vect_def (init_node, ivn);
8426 if (!nested_in_vect_loop
8427 && !integer_zerop (step_mul))
8428 {
8429 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8430 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8431 vec_step, step_mul);
8432 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8433 vec_def, up);
8434 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8435 }
8436
8437 /* Set the arguments of the phi node: */
8438 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8439
8440 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8441 }
8442 if (!nested_in_vect_loop)
8443 {
8444 /* Fill up to the number of vectors we need for the whole group. */
8445 nivs = least_common_multiple (group_size,
8446 const_nunits) / const_nunits;
8447 vec_steps.reserve (nivs-ivn);
8448 for (; ivn < nivs; ++ivn)
8449 {
8450 SLP_TREE_VEC_STMTS (slp_node)
8451 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8452 vec_steps.quick_push (vec_steps[0]);
8453 }
8454 }
8455
8456 /* Re-use IVs when we can. We are generating further vector
8457 stmts by adding VF' * stride to the IVs generated above. */
8458 if (ivn < nvects)
8459 {
8460 unsigned vfp
8461 = least_common_multiple (group_size, const_nunits) / group_size;
8462 tree lupdate_mul
8463 = build_vector_from_val (step_vectype,
8464 SCALAR_FLOAT_TYPE_P (stept)
8465 ? build_real_from_wide (stept,
8466 vfp, UNSIGNED)
8467 : build_int_cstu (stept, vfp));
8468 for (; ivn < nvects; ++ivn)
8469 {
8470 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8471 tree def = gimple_get_lhs (iv);
8472 if (ivn < 2*nivs)
8473 vec_steps[ivn - nivs]
8474 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8475 vec_steps[ivn - nivs], lupdate_mul);
8476 gimple_seq stmts = NULL;
8477 def = gimple_convert (&stmts, step_vectype, def);
8478 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8479 def, vec_steps[ivn % nivs]);
8480 def = gimple_convert (&stmts, vectype, def);
8481 if (gimple_code (iv) == GIMPLE_PHI)
8482 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8483 else
8484 {
8485 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8486 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8487 }
8488 SLP_TREE_VEC_STMTS (slp_node)
8489 .quick_push (SSA_NAME_DEF_STMT (def));
8490 }
8491 }
8492
8493 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8494 gcc_assert (!new_bb);
8495
8496 return true;
8497 }
8498
8499 init_expr = vect_phi_initial_value (phi);
8500
8501 gimple_seq stmts = NULL;
8502 if (!nested_in_vect_loop)
8503 {
8504 /* Convert the initial value to the IV update type. */
8505 tree new_type = TREE_TYPE (step_expr);
8506 init_expr = gimple_convert (&stmts, new_type, init_expr);
8507
8508 /* If we are using the loop mask to "peel" for alignment then we need
8509 to adjust the start value here. */
8510 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8511 if (skip_niters != NULL_TREE)
8512 {
8513 if (FLOAT_TYPE_P (vectype))
8514 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8515 skip_niters);
8516 else
8517 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8518 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8519 skip_niters, step_expr);
8520 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8521 init_expr, skip_step);
8522 }
8523 }
8524
8525 if (stmts)
8526 {
8527 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8528 gcc_assert (!new_bb);
8529 }
8530
8531 /* Create the vector that holds the initial_value of the induction. */
8532 if (nested_in_vect_loop)
8533 {
8534       /* iv_loop is nested in the loop to be vectorized.  init_expr has already
8535 been created during vectorization of previous stmts. We obtain it
8536 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8537 auto_vec<tree> vec_inits;
8538 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8539 init_expr, &vec_inits);
8540 vec_init = vec_inits[0];
8541 /* If the initial value is not of proper type, convert it. */
8542 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8543 {
8544 new_stmt
8545 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8546 vect_simple_var,
8547 "vec_iv_"),
8548 VIEW_CONVERT_EXPR,
8549 build1 (VIEW_CONVERT_EXPR, vectype,
8550 vec_init));
8551 vec_init = gimple_assign_lhs (new_stmt);
8552 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8553 new_stmt);
8554 gcc_assert (!new_bb);
8555 }
8556 }
8557 else
8558 {
8559 /* iv_loop is the loop to be vectorized. Create:
8560 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8561 stmts = NULL;
8562 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8563
8564 unsigned HOST_WIDE_INT const_nunits;
8565 if (nunits.is_constant (&const_nunits))
8566 {
8567 tree_vector_builder elts (step_vectype, const_nunits, 1);
8568 elts.quick_push (new_name);
8569 for (i = 1; i < const_nunits; i++)
8570 {
8571 /* Create: new_name_i = new_name + step_expr */
8572 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8573 new_name, step_expr);
8574 elts.quick_push (new_name);
8575 }
8576 /* Create a vector from [new_name_0, new_name_1, ...,
8577 new_name_nunits-1] */
8578 vec_init = gimple_build_vector (&stmts, &elts);
8579 }
8580 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8581 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8582 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8583 new_name, step_expr);
8584 else
8585 {
8586 /* Build:
8587 [base, base, base, ...]
8588 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8589 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8590 gcc_assert (flag_associative_math);
8591 tree index = build_index_vector (step_vectype, 0, 1);
8592 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8593 new_name);
8594 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8595 step_expr);
8596 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8597 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8598 vec_init, step_vec);
8599 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8600 vec_init, base_vec);
8601 }
8602 vec_init = gimple_convert (&stmts, vectype, vec_init);
8603
8604 if (stmts)
8605 {
8606 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8607 gcc_assert (!new_bb);
8608 }
8609 }
8610
8611
8612 /* Create the vector that holds the step of the induction. */
8613 if (nested_in_vect_loop)
8614 /* iv_loop is nested in the loop to be vectorized. Generate:
8615 vec_step = [S, S, S, S] */
8616 new_name = step_expr;
8617 else
8618 {
8619 /* iv_loop is the loop to be vectorized. Generate:
8620 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8621 gimple_seq seq = NULL;
8622 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8623 {
8624 expr = build_int_cst (integer_type_node, vf);
8625 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8626 }
8627 else
8628 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8629 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8630 expr, step_expr);
8631 if (seq)
8632 {
8633 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8634 gcc_assert (!new_bb);
8635 }
8636 }
8637
8638 t = unshare_expr (new_name);
8639 gcc_assert (CONSTANT_CLASS_P (new_name)
8640 || TREE_CODE (new_name) == SSA_NAME);
8641 new_vec = build_vector_from_val (step_vectype, t);
8642 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8643 new_vec, step_vectype, NULL);
8644
8645
8646 /* Create the following def-use cycle:
8647 loop prolog:
8648 vec_init = ...
8649 vec_step = ...
8650 loop:
8651 vec_iv = PHI <vec_init, vec_loop>
8652 ...
8653 STMT
8654 ...
8655 vec_loop = vec_iv + vec_step; */
8656
8657 /* Create the induction-phi that defines the induction-operand. */
8658 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8659 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8660 induc_def = PHI_RESULT (induction_phi);
8661
8662 /* Create the iv update inside the loop */
8663 stmts = NULL;
8664 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8665 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8666 vec_def = gimple_convert (&stmts, vectype, vec_def);
8667 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8668 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8669
8670 /* Set the arguments of the phi node: */
8671 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8672 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8673 UNKNOWN_LOCATION);
8674
8675 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8676 *vec_stmt = induction_phi;
8677
8678   /* If the vectorization factor (VF) is bigger than the number
8679      of elements that we can fit in a vectype (nunits), we have to generate
8680      more than one vector stmt - i.e., we need to "unroll" the
8681      vector stmt by a factor VF/nunits.  For more details see documentation
8682      in vectorizable_operation.  */
8683
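  /* Each additional copy is generated as the previous copy plus a step
     vector scaled by the number of vector elements.  */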
8684 if (ncopies > 1)
8685 {
8686 gimple_seq seq = NULL;
8687 /* FORNOW. This restriction should be relaxed. */
8688 gcc_assert (!nested_in_vect_loop);
8689
8690 /* Create the vector that holds the step of the induction. */
8691 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8692 {
8693 expr = build_int_cst (integer_type_node, nunits);
8694 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8695 }
8696 else
8697 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8698 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8699 expr, step_expr);
8700 if (seq)
8701 {
8702 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8703 gcc_assert (!new_bb);
8704 }
8705
8706 t = unshare_expr (new_name);
8707 gcc_assert (CONSTANT_CLASS_P (new_name)
8708 || TREE_CODE (new_name) == SSA_NAME);
8709 new_vec = build_vector_from_val (step_vectype, t);
8710 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8711 new_vec, step_vectype, NULL);
8712
8713 vec_def = induc_def;
8714 for (i = 1; i < ncopies; i++)
8715 {
8716 /* vec_i = vec_prev + vec_step */
8717 gimple_seq stmts = NULL;
8718 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8719 vec_def = gimple_build (&stmts,
8720 PLUS_EXPR, step_vectype, vec_def, vec_step);
8721 vec_def = gimple_convert (&stmts, vectype, vec_def);
8722
8723 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8724 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8725 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8726 }
8727 }
8728
8729 if (dump_enabled_p ())
8730 dump_printf_loc (MSG_NOTE, vect_location,
8731 "transform induction: created def-use cycle: %G%G",
8732 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8733
8734 return true;
8735 }
8736
8737 /* Function vectorizable_live_operation.
8738
8739 STMT_INFO computes a value that is used outside the loop. Check if
8740 it can be supported. */
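/* A "live" statement computes a scalar value that is also used after the
   loop.  When vectorized, the scalar is recovered by extracting a lane
   (normally the last one) from the final vector value, or via EXTRACT_LAST
   with the loop mask for fully-masked loops.  */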
8741
8742 bool
8743 vectorizable_live_operation (vec_info *vinfo,
8744 stmt_vec_info stmt_info,
8745 gimple_stmt_iterator *gsi,
8746 slp_tree slp_node, slp_instance slp_node_instance,
8747 int slp_index, bool vec_stmt_p,
8748 stmt_vector_for_cost *cost_vec)
8749 {
8750 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8751 imm_use_iterator imm_iter;
8752 tree lhs, lhs_type, bitsize;
8753 tree vectype = (slp_node
8754 ? SLP_TREE_VECTYPE (slp_node)
8755 : STMT_VINFO_VECTYPE (stmt_info));
8756 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8757 int ncopies;
8758 gimple *use_stmt;
8759 auto_vec<tree> vec_oprnds;
8760 int vec_entry = 0;
8761 poly_uint64 vec_index = 0;
8762
8763 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8764
8765 /* If a stmt of a reduction is live, vectorize it via
8766 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8767 validity so just trigger the transform here. */
8768 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8769 {
8770 if (!vec_stmt_p)
8771 return true;
8772 if (slp_node)
8773 {
8774 /* For reduction chains the meta-info is attached to
8775 the group leader. */
8776 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8777 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8778 /* For SLP reductions we vectorize the epilogue for
8779 all involved stmts together. */
8780 else if (slp_index != 0)
8781 return true;
8782 }
8783 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8784 gcc_assert (reduc_info->is_reduc_info);
8785 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8786 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8787 return true;
8788 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8789 slp_node_instance);
8790 return true;
8791 }
8792
8793 /* If STMT is not relevant and it is a simple assignment and its inputs are
8794 invariant then it can remain in place, unvectorized. The original last
8795 scalar value that it computes will be used. */
8796 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8797 {
8798 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8799 if (dump_enabled_p ())
8800 dump_printf_loc (MSG_NOTE, vect_location,
8801 "statement is simple and uses invariant. Leaving in "
8802 "place.\n");
8803 return true;
8804 }
8805
8806 if (slp_node)
8807 ncopies = 1;
8808 else
8809 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8810
8811 if (slp_node)
8812 {
8813 gcc_assert (slp_index >= 0);
8814
8815 /* Get the last occurrence of the scalar index from the concatenation of
8816 all the slp vectors. Calculate which slp vector it is and the index
8817 within. */
8818 int num_scalar = SLP_TREE_LANES (slp_node);
8819 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8820 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8821
8822 /* Calculate which vector contains the result, and which lane of
8823 that vector we need. */
8824 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8825 {
8826 if (dump_enabled_p ())
8827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8828 "Cannot determine which vector holds the"
8829 " final result.\n");
8830 return false;
8831 }
8832 }
8833
8834 if (!vec_stmt_p)
8835 {
8836 /* No transformation required. */
8837 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8838 {
8839 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8840 OPTIMIZE_FOR_SPEED))
8841 {
8842 if (dump_enabled_p ())
8843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8844 "can't operate on partial vectors "
8845 "because the target doesn't support extract "
8846 "last reduction.\n");
8847 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8848 }
8849 else if (slp_node)
8850 {
8851 if (dump_enabled_p ())
8852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8853 "can't operate on partial vectors "
8854 "because an SLP statement is live after "
8855 "the loop.\n");
8856 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8857 }
8858 else if (ncopies > 1)
8859 {
8860 if (dump_enabled_p ())
8861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8862 "can't operate on partial vectors "
8863 "because ncopies is greater than 1.\n");
8864 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8865 }
8866 else
8867 {
8868 gcc_assert (ncopies == 1 && !slp_node);
8869 vect_record_loop_mask (loop_vinfo,
8870 &LOOP_VINFO_MASKS (loop_vinfo),
8871 1, vectype, NULL);
8872 }
8873 }
8874 /* ??? Enable for loop costing as well. */
8875 if (!loop_vinfo)
8876 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8877 0, vect_epilogue);
8878 return true;
8879 }
8880
8881 /* Use the lhs of the original scalar statement. */
8882 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8883 if (dump_enabled_p ())
8884 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8885 "stmt %G", stmt);
8886
8887 lhs = gimple_get_lhs (stmt);
8888 lhs_type = TREE_TYPE (lhs);
8889
8890 bitsize = vector_element_bits_tree (vectype);
8891
8892 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8893 tree vec_lhs, bitstart;
8894 gimple *vec_stmt;
8895 if (slp_node)
8896 {
8897 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8898
8899 /* Get the correct slp vectorized stmt. */
8900 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8901 vec_lhs = gimple_get_lhs (vec_stmt);
8902
8903 /* Get entry to use. */
8904 bitstart = bitsize_int (vec_index);
8905 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8906 }
8907 else
8908 {
8909 /* For multiple copies, get the last copy. */
8910 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8911 vec_lhs = gimple_get_lhs (vec_stmt);
8912
8913 /* Get the last lane in the vector. */
8914 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8915 }
8916
8917 if (loop_vinfo)
8918 {
8919       /* To ensure the VEC_LHS for lane-extraction stmts satisfies the
8920 	 loop-closed-PHI requirement, insert one phi node for it.  It looks like:
8921 loop;
8922 BB:
8923 # lhs' = PHI <lhs>
8924 ==>
8925 loop;
8926 BB:
8927 # vec_lhs' = PHI <vec_lhs>
8928 new_tree = lane_extract <vec_lhs', ...>;
8929 lhs' = new_tree; */
8930
8931 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8932 basic_block exit_bb = single_exit (loop)->dest;
8933 gcc_assert (single_pred_p (exit_bb));
8934
8935 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8936 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8937 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8938
8939 gimple_seq stmts = NULL;
8940 tree new_tree;
8941 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8942 {
8943 /* Emit:
8944
8945 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8946
8947 where VEC_LHS is the vectorized live-out result and MASK is
8948 the loop mask for the final iteration. */
8949 gcc_assert (ncopies == 1 && !slp_node);
8950 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8951 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8952 1, vectype, 0);
8953 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8954 mask, vec_lhs_phi);
8955
8956 /* Convert the extracted vector element to the scalar type. */
8957 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8958 }
8959 else
8960 {
8961 tree bftype = TREE_TYPE (vectype);
8962 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8963 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8964 new_tree = build3 (BIT_FIELD_REF, bftype,
8965 vec_lhs_phi, bitsize, bitstart);
8966 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8967 &stmts, true, NULL_TREE);
8968 }
8969
8970 if (stmts)
8971 {
8972 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8973 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8974
8975 	  /* Remove the existing phi of lhs and assign new_tree to its result.  */
8976 tree lhs_phi = NULL_TREE;
8977 gimple_stmt_iterator gsi;
8978 for (gsi = gsi_start_phis (exit_bb);
8979 !gsi_end_p (gsi); gsi_next (&gsi))
8980 {
8981 gimple *phi = gsi_stmt (gsi);
8982 if ((gimple_phi_arg_def (phi, 0) == lhs))
8983 {
8984 remove_phi_node (&gsi, false);
8985 lhs_phi = gimple_phi_result (phi);
8986 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8987 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8988 break;
8989 }
8990 }
8991 }
8992
8993 /* Replace use of lhs with newly computed result. If the use stmt is a
8994 	 single arg PHI, just replace all uses of the PHI result.  This is necessary
8995 	 because the lcssa PHI defining lhs may come before the newly inserted stmt.  */
8996 use_operand_p use_p;
8997 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8998 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8999 && !is_gimple_debug (use_stmt))
9000 {
9001 if (gimple_code (use_stmt) == GIMPLE_PHI
9002 && gimple_phi_num_args (use_stmt) == 1)
9003 {
9004 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9005 }
9006 else
9007 {
9008 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9009 SET_USE (use_p, new_tree);
9010 }
9011 update_stmt (use_stmt);
9012 }
9013 }
9014 else
9015 {
9016 /* For basic-block vectorization simply insert the lane-extraction. */
9017 tree bftype = TREE_TYPE (vectype);
9018 if (VECTOR_BOOLEAN_TYPE_P (vectype))
9019 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9020 tree new_tree = build3 (BIT_FIELD_REF, bftype,
9021 vec_lhs, bitsize, bitstart);
9022 gimple_seq stmts = NULL;
9023 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9024 &stmts, true, NULL_TREE);
9025 if (TREE_CODE (new_tree) == SSA_NAME
9026 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9027 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9028 if (is_a <gphi *> (vec_stmt))
9029 {
9030 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9031 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9032 }
9033 else
9034 {
9035 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9036 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9037 }
9038
9039 /* Replace use of lhs with newly computed result. If the use stmt is a
9040 	 single arg PHI, just replace all uses of the PHI result.  This is necessary
9041 	 because the lcssa PHI defining lhs may come before the newly inserted stmt.  */
9042 use_operand_p use_p;
9043 stmt_vec_info use_stmt_info;
9044 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9045 if (!is_gimple_debug (use_stmt)
9046 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9047 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9048 {
9049 /* ??? This can happen when the live lane ends up being
9050 used in a vector construction code-generated by an
9051 external SLP node (and code-generation for that already
9052 happened). See gcc.dg/vect/bb-slp-47.c.
9053 Doing this is what would happen if that vector CTOR
9054 were not code-generated yet so it is not too bad.
9055 ??? In fact we'd likely want to avoid this situation
9056 in the first place. */
9057 if (TREE_CODE (new_tree) == SSA_NAME
9058 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9059 && gimple_code (use_stmt) != GIMPLE_PHI
9060 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9061 use_stmt))
9062 {
9063 enum tree_code code = gimple_assign_rhs_code (use_stmt);
9064 gcc_checking_assert (code == SSA_NAME
9065 || code == CONSTRUCTOR
9066 || code == VIEW_CONVERT_EXPR
9067 || CONVERT_EXPR_CODE_P (code));
9068 if (dump_enabled_p ())
9069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9070 "Using original scalar computation for "
9071 					 "live lane because use precedes vector "
9072 "def\n");
9073 continue;
9074 }
9075 /* ??? It can also happen that we end up pulling a def into
9076 a loop where replacing out-of-loop uses would require
9077 a new LC SSA PHI node. Retain the original scalar in
9078 those cases as well. PR98064. */
9079 if (TREE_CODE (new_tree) == SSA_NAME
9080 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9081 && (gimple_bb (use_stmt)->loop_father
9082 != gimple_bb (vec_stmt)->loop_father)
9083 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9084 gimple_bb (use_stmt)->loop_father))
9085 {
9086 if (dump_enabled_p ())
9087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9088 "Using original scalar computation for "
9089 "live lane because there is an out-of-loop "
9090 "definition for it\n");
9091 continue;
9092 }
9093 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9094 SET_USE (use_p, new_tree);
9095 update_stmt (use_stmt);
9096 }
9097 }
9098
9099 return true;
9100 }
9101
9102 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
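/* The scalar statements may be removed after vectorization, so debug binds
   outside the loop that still reference their defs are reset rather than
   left pointing at stale values.  */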
9103
9104 static void
9105 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9106 {
9107 ssa_op_iter op_iter;
9108 imm_use_iterator imm_iter;
9109 def_operand_p def_p;
9110 gimple *ustmt;
9111
9112 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9113 {
9114 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9115 {
9116 basic_block bb;
9117
9118 if (!is_gimple_debug (ustmt))
9119 continue;
9120
9121 bb = gimple_bb (ustmt);
9122
9123 if (!flow_bb_inside_loop_p (loop, bb))
9124 {
9125 if (gimple_debug_bind_p (ustmt))
9126 {
9127 if (dump_enabled_p ())
9128 dump_printf_loc (MSG_NOTE, vect_location,
9129 "killing debug use\n");
9130
9131 gimple_debug_bind_reset_value (ustmt);
9132 update_stmt (ustmt);
9133 }
9134 else
9135 gcc_unreachable ();
9136 }
9137 }
9138 }
9139 }
9140
9141 /* Given loop represented by LOOP_VINFO, return true if computation of
9142 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9143 otherwise. */
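/* Overflow is possible because NITERS is computed as NITERSM1 + 1; when
   NITERSM1 is the maximum value of its type the addition wraps to zero.  */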
9144
9145 static bool
9146 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9147 {
9148 /* Constant case. */
9149 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9150 {
9151 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9152 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9153
9154 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9155 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9156 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9157 return true;
9158 }
9159
9160 widest_int max;
9161 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9162 /* Check the upper bound of loop niters. */
9163 if (get_max_loop_iterations (loop, &max))
9164 {
9165 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9166 signop sgn = TYPE_SIGN (type);
9167 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9168 if (max < type_max)
9169 return true;
9170 }
9171 return false;
9172 }
9173
9174 /* Return a mask type with half the number of elements as OLD_TYPE,
9175 given that it should have mode NEW_MODE. */
9176
9177 tree
9178 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9179 {
9180 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9181 return build_truth_vector_type_for_mode (nunits, new_mode);
9182 }
9183
9184 /* Return a mask type with twice as many elements as OLD_TYPE,
9185 given that it should have mode NEW_MODE. */
9186
9187 tree
9188 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9189 {
9190 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9191 return build_truth_vector_type_for_mode (nunits, new_mode);
9192 }
9193
9194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9195 contain a sequence of NVECTORS masks that each control a vector of type
9196 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9197 these vector masks with the vector version of SCALAR_MASK. */
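/* For instance (illustrative numbers only): with NVECTORS == 2, a VECTYPE of
   8 elements and a vectorization factor of 8, the rgroup covers
   2 * 8 / 8 == 2 scalars per scalar iteration, which is what is recorded in
   max_nscalars_per_iter below.  */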
9198
9199 void
9200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9201 unsigned int nvectors, tree vectype, tree scalar_mask)
9202 {
9203 gcc_assert (nvectors != 0);
9204 if (masks->length () < nvectors)
9205 masks->safe_grow_cleared (nvectors, true);
9206 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9207 /* The number of scalars per iteration and the number of vectors are
9208 both compile-time constants. */
9209 unsigned int nscalars_per_iter
9210 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9211 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9212
9213 if (scalar_mask)
9214 {
9215 scalar_cond_masked_key cond (scalar_mask, nvectors);
9216 loop_vinfo->scalar_cond_masked_set.add (cond);
9217 }
9218
9219 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9220 {
9221 rgm->max_nscalars_per_iter = nscalars_per_iter;
9222 rgm->type = truth_type_for (vectype);
9223 rgm->factor = 1;
9224 }
9225 }
9226
9227 /* Given a complete set of masks MASKS, extract mask number INDEX
9228 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9229 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9230
9231 See the comment above vec_loop_masks for more details about the mask
9232 arrangement. */
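/* For example, a mask computed for vectors of 16 bytes may be re-used for
   vectors of 8 halfwords: each pair of byte-mask elements is known to be
   all-zero or all-one, so the mask can simply be view-converted to the
   8-element mask type.  */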
9233
9234 tree
9235 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9236 unsigned int nvectors, tree vectype, unsigned int index)
9237 {
9238 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9239 tree mask_type = rgm->type;
9240
9241 /* Populate the rgroup's mask array, if this is the first time we've
9242 used it. */
9243 if (rgm->controls.is_empty ())
9244 {
9245 rgm->controls.safe_grow_cleared (nvectors, true);
9246 for (unsigned int i = 0; i < nvectors; ++i)
9247 {
9248 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9249 /* Provide a dummy definition until the real one is available. */
9250 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9251 rgm->controls[i] = mask;
9252 }
9253 }
9254
9255 tree mask = rgm->controls[index];
9256 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9257 TYPE_VECTOR_SUBPARTS (vectype)))
9258 {
9259 /* A loop mask for data type X can be reused for data type Y
9260 if X has N times more elements than Y and if Y's elements
9261 are N times bigger than X's. In this case each sequence
9262 of N elements in the loop mask will be all-zero or all-one.
9263 We can then view-convert the mask so that each sequence of
9264 N elements is replaced by a single element. */
9265 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9266 TYPE_VECTOR_SUBPARTS (vectype)));
9267 gimple_seq seq = NULL;
9268 mask_type = truth_type_for (vectype);
9269 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9270 if (seq)
9271 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9272 }
9273 return mask;
9274 }
9275
9276 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9277 lengths for controlling an operation on VECTYPE. The operation splits
9278 each element of VECTYPE into FACTOR separate subelements, measuring the
9279 length as a number of these subelements. */
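/* For example, when a vector of 4 ints is accessed via a byte-granular
   length-controlled load or store, FACTOR is 4 and the recorded lengths
   count bytes rather than ints.  */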
9280
9281 void
9282 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9283 unsigned int nvectors, tree vectype, unsigned int factor)
9284 {
9285 gcc_assert (nvectors != 0);
9286 if (lens->length () < nvectors)
9287 lens->safe_grow_cleared (nvectors, true);
9288 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9289
9290   /* The number of scalars per iteration, the number of bytes each scalar
9291      occupies and the number of vectors are all compile-time constants.  */
9292 unsigned int nscalars_per_iter
9293 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9294 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9295
9296 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9297 {
9298 /* For now, we only support cases in which all loads and stores fall back
9299 to VnQI or none do. */
9300 gcc_assert (!rgl->max_nscalars_per_iter
9301 || (rgl->factor == 1 && factor == 1)
9302 || (rgl->max_nscalars_per_iter * rgl->factor
9303 == nscalars_per_iter * factor));
9304 rgl->max_nscalars_per_iter = nscalars_per_iter;
9305 rgl->type = vectype;
9306 rgl->factor = factor;
9307 }
9308 }
9309
9310 /* Given a complete set of length LENS, extract length number INDEX for an
9311 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9312
9313 tree
9314 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9315 unsigned int nvectors, unsigned int index)
9316 {
9317 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9318 bool use_bias_adjusted_len =
9319 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
9320
9321 /* Populate the rgroup's len array, if this is the first time we've
9322 used it. */
9323 if (rgl->controls.is_empty ())
9324 {
9325 rgl->controls.safe_grow_cleared (nvectors, true);
9326 for (unsigned int i = 0; i < nvectors; ++i)
9327 {
9328 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9329 gcc_assert (len_type != NULL_TREE);
9330
9331 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9332
9333 /* Provide a dummy definition until the real one is available. */
9334 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9335 rgl->controls[i] = len;
9336
9337 if (use_bias_adjusted_len)
9338 {
9339 gcc_assert (i == 0);
9340 tree adjusted_len =
9341 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9342 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9343 rgl->bias_adjusted_ctrl = adjusted_len;
9344 }
9345 }
9346 }
9347
9348 if (use_bias_adjusted_len)
9349 return rgl->bias_adjusted_ctrl;
9350 else
9351 return rgl->controls[index];
9352 }
9353
9354 /* Scale profiling counters by estimation for LOOP which is vectorized
9355 by factor VF. */
9356
9357 static void
9358 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9359 {
9360 edge preheader = loop_preheader_edge (loop);
9361 /* Reduce loop iterations by the vectorization factor. */
9362 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9363 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9364
9365 if (freq_h.nonzero_p ())
9366 {
9367 profile_probability p;
9368
9369 /* Avoid dropping loop body profile counter to 0 because of zero count
9370 in loop's preheader. */
9371 if (!(freq_e == profile_count::zero ()))
9372 freq_e = freq_e.force_nonzero ();
9373 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9374 scale_loop_frequencies (loop, p);
9375 }
9376
9377 edge exit_e = single_exit (loop);
9378 exit_e->probability = profile_probability::always ()
9379 .apply_scale (1, new_est_niter + 1);
9380
9381 edge exit_l = single_pred_edge (loop->latch);
9382 profile_probability prob = exit_l->probability;
9383 exit_l->probability = exit_e->probability.invert ();
9384 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9385 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9386 }
9387
9388 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9389 latch edge values originally defined by it. */
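/* The vectorized cycle PHIs are created without their latch arguments;
   once the statement providing the scalar latch value has been vectorized
   this adds the corresponding vector defs to those PHIs.  */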
9390
9391 static void
9392 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9393 stmt_vec_info def_stmt_info)
9394 {
9395 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9396 if (!def || TREE_CODE (def) != SSA_NAME)
9397 return;
9398 stmt_vec_info phi_info;
9399 imm_use_iterator iter;
9400 use_operand_p use_p;
9401 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9402 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9403 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9404 && (phi_info = loop_vinfo->lookup_stmt (phi))
9405 && STMT_VINFO_RELEVANT_P (phi_info)
9406 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9407 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9408 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9409 {
9410 loop_p loop = gimple_bb (phi)->loop_father;
9411 edge e = loop_latch_edge (loop);
9412 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9413 {
9414 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9415 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9416 gcc_assert (phi_defs.length () == latch_defs.length ());
9417 for (unsigned i = 0; i < phi_defs.length (); ++i)
9418 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9419 gimple_get_lhs (latch_defs[i]), e,
9420 gimple_phi_arg_location (phi, e->dest_idx));
9421 }
9422 }
9423 }
9424
9425 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9426 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9427 stmt_vec_info. */
9428
9429 static bool
9430 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9431 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9432 {
9433 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9434 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9435
9436 if (dump_enabled_p ())
9437 dump_printf_loc (MSG_NOTE, vect_location,
9438 "------>vectorizing statement: %G", stmt_info->stmt);
9439
9440 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9441 vect_loop_kill_debug_uses (loop, stmt_info);
9442
9443 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9444 && !STMT_VINFO_LIVE_P (stmt_info))
9445 return false;
9446
9447 if (STMT_VINFO_VECTYPE (stmt_info))
9448 {
9449 poly_uint64 nunits
9450 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9451 if (!STMT_SLP_TYPE (stmt_info)
9452 && maybe_ne (nunits, vf)
9453 && dump_enabled_p ())
9454 /* For SLP, VF is set according to the unrolling factor and not
9455 to the vector size; hence this dump is not valid for SLP. */
9456 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9457 }
9458
9459 /* Pure SLP statements have already been vectorized. We still need
9460 to apply loop vectorization to hybrid SLP statements. */
9461 if (PURE_SLP_STMT (stmt_info))
9462 return false;
9463
9464 if (dump_enabled_p ())
9465 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9466
9467 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9468 *seen_store = stmt_info;
9469
9470 return true;
9471 }
9472
9473 /* Helper function to pass to simplify_replace_tree, replacing each tree
9474 found in the hash_map with its corresponding value. */
9475
9476 static tree
9477 find_in_mapping (tree t, void *context)
9478 {
9479 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9480
9481 tree *value = mapping->get (t);
9482 return value ? *value : t;
9483 }
9484
9485 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9486 original loop that has now been vectorized.
9487
9488 The inits of the data_references need to be advanced with the number of
9489 iterations of the main loop. This has been computed in vect_do_peeling and
9490 is stored in parameter ADVANCE. We first restore the data_references'
9491 initial offsets with the values recorded in ORIG_DRS_INIT.
9492
9493 Since the loop_vec_info of this EPILOGUE was constructed for the original
9494 loop, its stmt_vec_infos all point to the original statements. These need
9495 to be updated to point to their corresponding copies as well as the SSA_NAMES
9496 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9497
9498 The data_references' connections also need to be updated: their
9499 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9500 stmt_vec_infos, their statements need to point to their corresponding copies,
9501 and for gather loads or scatter stores their reference needs to be
9502 updated to point to its corresponding copy. */
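/* A purely illustrative example (SSA names invented): if the main loop
contains "_5 = a[i_3]" and its epilogue copy contains "_15 = a[i_13]",
the code below records the mappings _5 -> _15 and i_3 -> i_13. Pattern
statements, related statements and gather/scatter DR_REFs that were copied
verbatim from the main loop are then rewritten through this mapping so
that they refer to the epilogue's own SSA names. */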
9503
9504 static void
9505 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9506 {
9507 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9508 auto_vec<gimple *> stmt_worklist;
9509 hash_map<tree,tree> mapping;
9510 gimple *orig_stmt, *new_stmt;
9511 gimple_stmt_iterator epilogue_gsi;
9512 gphi_iterator epilogue_phi_gsi;
9513 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9514 basic_block *epilogue_bbs = get_loop_body (epilogue);
9515 unsigned i;
9516
9517 free (LOOP_VINFO_BBS (epilogue_vinfo));
9518 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9519
9520 /* Advance the data_references with the number of iterations of the previous
9521 loop and its prologue. */
9522 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9523
9524
9525 /* The EPILOGUE loop is a copy of the original loop so they share the same
9526 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9527 point to the copied statements. We also create a mapping from each LHS in
9528 the original loop to the corresponding LHS in the EPILOGUE and create worklists
9529 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9530 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9531 {
9532 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9533 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9534 {
9535 new_stmt = epilogue_phi_gsi.phi ();
9536
9537 gcc_assert (gimple_uid (new_stmt) > 0);
9538 stmt_vinfo
9539 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9540
9541 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9542 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9543
9544 mapping.put (gimple_phi_result (orig_stmt),
9545 gimple_phi_result (new_stmt));
9546 /* PHI nodes can not have patterns or related statements. */
9547 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9548 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9549 }
9550
9551 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9552 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9553 {
9554 new_stmt = gsi_stmt (epilogue_gsi);
9555 if (is_gimple_debug (new_stmt))
9556 continue;
9557
9558 gcc_assert (gimple_uid (new_stmt) > 0);
9559 stmt_vinfo
9560 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9561
9562 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9563 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9564
9565 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9566 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9567
9568 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9569 {
9570 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9571 for (gimple_stmt_iterator gsi = gsi_start (seq);
9572 !gsi_end_p (gsi); gsi_next (&gsi))
9573 stmt_worklist.safe_push (gsi_stmt (gsi));
9574 }
9575
9576 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9577 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9578 {
9579 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9580 stmt_worklist.safe_push (stmt);
9581 /* Set BB such that the assert in
9582 'get_initial_def_for_reduction' is able to determine that
9583 the BB of the related stmt is inside this loop. */
9584 gimple_set_bb (stmt,
9585 gimple_bb (new_stmt));
9586 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9587 gcc_assert (related_vinfo == NULL
9588 || related_vinfo == stmt_vinfo);
9589 }
9590 }
9591 }
9592
9593 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9594 using the original main loop and thus need to be updated to refer to the
9595 cloned variables used in the epilogue. */
9596 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9597 {
9598 gimple *stmt = stmt_worklist[i];
9599 tree *new_op;
9600
9601 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9602 {
9603 tree op = gimple_op (stmt, j);
9604 if ((new_op = mapping.get(op)))
9605 gimple_set_op (stmt, j, *new_op);
9606 else
9607 {
9608 /* PR92429: The last argument of simplify_replace_tree disables
9609 folding when replacing arguments. This is required as
9610 otherwise you might end up with different statements than the
9611 ones analyzed in vect_loop_analyze, leading to different
9612 vectorization. */
9613 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9614 &find_in_mapping, &mapping, false);
9615 gimple_set_op (stmt, j, op);
9616 }
9617 }
9618 }
9619
9620 struct data_reference *dr;
9621 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9622 FOR_EACH_VEC_ELT (datarefs, i, dr)
9623 {
9624 orig_stmt = DR_STMT (dr);
9625 gcc_assert (gimple_uid (orig_stmt) > 0);
9626 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9627 /* Data references for gather loads and scatter stores do not use the
9628 updated offset we set using ADVANCE. Instead we have to make sure the
9629 reference in the data references points to the corresponding copy of
9630 the original in the epilogue. */
9631 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9632 == VMAT_GATHER_SCATTER)
9633 {
9634 DR_REF (dr)
9635 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9636 &find_in_mapping, &mapping);
9637 DR_BASE_ADDRESS (dr)
9638 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9639 &find_in_mapping, &mapping);
9640 }
9641 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9642 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9643 }
9644
9645 epilogue_vinfo->shared->datarefs_copy.release ();
9646 epilogue_vinfo->shared->save_datarefs ();
9647 }
9648
9649 /* Function vect_transform_loop.
9650
9651 The analysis phase has determined that the loop is vectorizable.
9652 Vectorize the loop - create vectorized stmts to replace the scalar
9653 stmts in the loop, and update the loop exit condition.
9654 Returns the scalar epilogue loop if any. */
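/* In outline, the code below proceeds as follows: version the loop if
required, peel it via vect_do_peeling, schedule any SLP instances,
vectorize the remaining relevant statements basic block by basic block,
install the new loop bound and scale the profile, and finally return the
scalar epilogue loop (if any) so it can be considered for vectorization
in turn. */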
9655
9656 class loop *
9657 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9658 {
9659 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9660 class loop *epilogue = NULL;
9661 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9662 int nbbs = loop->num_nodes;
9663 int i;
9664 tree niters_vector = NULL_TREE;
9665 tree step_vector = NULL_TREE;
9666 tree niters_vector_mult_vf = NULL_TREE;
9667 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9668 unsigned int lowest_vf = constant_lower_bound (vf);
9669 gimple *stmt;
9670 bool check_profitability = false;
9671 unsigned int th;
9672
9673 DUMP_VECT_SCOPE ("vec_transform_loop");
9674
9675 loop_vinfo->shared->check_datarefs ();
9676
9677 /* Use the more conservative vectorization threshold. If the number
9678 of iterations is constant, assume the cost check has been performed
9679 by our caller. If the threshold makes all loops profitable that
9680 run at least the (estimated) vectorization factor number of times,
9681 checking is pointless, too. */
9682 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9683 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9684 {
9685 if (dump_enabled_p ())
9686 dump_printf_loc (MSG_NOTE, vect_location,
9687 "Profitability threshold is %d loop iterations.\n",
9688 th);
9689 check_profitability = true;
9690 }
9691
9692 /* Make sure there exists a single-predecessor exit bb. Do this before
9693 versioning. */
9694 edge e = single_exit (loop);
9695 if (! single_pred_p (e->dest))
9696 {
9697 split_loop_exit_edge (e, true);
9698 if (dump_enabled_p ())
9699 dump_printf (MSG_NOTE, "split exit edge\n");
9700 }
9701
9702 /* Version the loop first, if required, so the profitability check
9703 comes first. */
9704
9705 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9706 {
9707 class loop *sloop
9708 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9709 sloop->force_vectorize = false;
9710 check_profitability = false;
9711 }
9712
9713 /* Make sure there exists a single-predecessor exit bb also on the
9714 scalar loop copy. Do this after versioning but before peeling
9715 so the CFG structure is fine for both the scalar and the if-converted
9716 loop, which lets slpeel_duplicate_current_defs_from_edges see matched
9717 loop-closed PHI nodes on the exit. */
9718 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9719 {
9720 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9721 if (! single_pred_p (e->dest))
9722 {
9723 split_loop_exit_edge (e, true);
9724 if (dump_enabled_p ())
9725 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9726 }
9727 }
9728
9729 tree niters = vect_build_loop_niters (loop_vinfo);
9730 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9731 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9732 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9733 tree advance;
9734 drs_init_vec orig_drs_init;
9735
9736 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9737 &step_vector, &niters_vector_mult_vf, th,
9738 check_profitability, niters_no_overflow,
9739 &advance);
9740
9741 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9742 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9743 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9744 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9745
9746 if (niters_vector == NULL_TREE)
9747 {
9748 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9749 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9750 && known_eq (lowest_vf, vf))
9751 {
9752 niters_vector
9753 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9754 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9755 step_vector = build_one_cst (TREE_TYPE (niters));
9756 }
9757 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9758 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9759 &step_vector, niters_no_overflow);
9760 else
9761 /* vect_do_peeling subtracted the number of peeled prologue
9762 iterations from LOOP_VINFO_NITERS. */
9763 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9764 &niters_vector, &step_vector,
9765 niters_no_overflow);
9766 }
9767
9768 /* 1) Make sure the loop header has exactly two entries
9769 2) Make sure we have a preheader basic block. */
9770
9771 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9772
9773 split_edge (loop_preheader_edge (loop));
9774
9775 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9776 /* This will deal with any possible peeling. */
9777 vect_prepare_for_masked_peels (loop_vinfo);
9778
9779 /* Schedule the SLP instances first, then handle loop vectorization
9780 below. */
9781 if (!loop_vinfo->slp_instances.is_empty ())
9782 {
9783 DUMP_VECT_SCOPE ("scheduling SLP instances");
9784 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9785 }
9786
9787 /* FORNOW: the vectorizer supports only loops whose body consists
9788 of one basic block (header + empty latch). When the vectorizer
9789 supports more involved loop forms, the order in which the BBs are
9790 traversed needs to be reconsidered. */
9791
9792 for (i = 0; i < nbbs; i++)
9793 {
9794 basic_block bb = bbs[i];
9795 stmt_vec_info stmt_info;
9796
9797 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9798 gsi_next (&si))
9799 {
9800 gphi *phi = si.phi ();
9801 if (dump_enabled_p ())
9802 dump_printf_loc (MSG_NOTE, vect_location,
9803 "------>vectorizing phi: %G", phi);
9804 stmt_info = loop_vinfo->lookup_stmt (phi);
9805 if (!stmt_info)
9806 continue;
9807
9808 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9809 vect_loop_kill_debug_uses (loop, stmt_info);
9810
9811 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9812 && !STMT_VINFO_LIVE_P (stmt_info))
9813 continue;
9814
9815 if (STMT_VINFO_VECTYPE (stmt_info)
9816 && (maybe_ne
9817 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9818 && dump_enabled_p ())
9819 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9820
9821 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9822 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9823 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9824 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9825 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9826 && ! PURE_SLP_STMT (stmt_info))
9827 {
9828 if (dump_enabled_p ())
9829 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9830 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9831 }
9832 }
9833
9834 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9835 gsi_next (&si))
9836 {
9837 gphi *phi = si.phi ();
9838 stmt_info = loop_vinfo->lookup_stmt (phi);
9839 if (!stmt_info)
9840 continue;
9841
9842 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9843 && !STMT_VINFO_LIVE_P (stmt_info))
9844 continue;
9845
9846 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9847 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9848 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9850 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9851 && ! PURE_SLP_STMT (stmt_info))
9852 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9853 }
9854
9855 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9856 !gsi_end_p (si);)
9857 {
9858 stmt = gsi_stmt (si);
9859 /* During vectorization remove existing clobber stmts. */
9860 if (gimple_clobber_p (stmt))
9861 {
9862 unlink_stmt_vdef (stmt);
9863 gsi_remove (&si, true);
9864 release_defs (stmt);
9865 }
9866 else
9867 {
9868 /* Ignore vector stmts created in the outer loop. */
9869 stmt_info = loop_vinfo->lookup_stmt (stmt);
9870
9871 /* vector stmts created in the outer-loop during vectorization of
9872 stmts in an inner-loop may not have a stmt_info, and do not
9873 need to be vectorized. */
9874 stmt_vec_info seen_store = NULL;
9875 if (stmt_info)
9876 {
9877 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9878 {
9879 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9880 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9881 !gsi_end_p (subsi); gsi_next (&subsi))
9882 {
9883 stmt_vec_info pat_stmt_info
9884 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9885 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9886 &si, &seen_store);
9887 }
9888 stmt_vec_info pat_stmt_info
9889 = STMT_VINFO_RELATED_STMT (stmt_info);
9890 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9891 &si, &seen_store))
9892 maybe_set_vectorized_backedge_value (loop_vinfo,
9893 pat_stmt_info);
9894 }
9895 else
9896 {
9897 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9898 &seen_store))
9899 maybe_set_vectorized_backedge_value (loop_vinfo,
9900 stmt_info);
9901 }
9902 }
9903 gsi_next (&si);
9904 if (seen_store)
9905 {
9906 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9907 /* Interleaving: the vectorization of the
9908 interleaving chain has been completed -
9909 free all the stores in the chain. */
9910 vect_remove_stores (loop_vinfo,
9911 DR_GROUP_FIRST_ELEMENT (seen_store));
9912 else
9913 /* Free the attached stmt_vec_info and remove the stmt. */
9914 loop_vinfo->remove_stmt (stmt_info);
9915 }
9916 }
9917 }
9918
9919 /* Stub out scalar statements that must not survive vectorization.
9920 Doing this here helps with grouped statements, or statements that
9921 are involved in patterns. */
9922 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9923 !gsi_end_p (gsi); gsi_next (&gsi))
9924 {
9925 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9926 if (!call || !gimple_call_internal_p (call))
9927 continue;
9928 internal_fn ifn = gimple_call_internal_fn (call);
9929 if (ifn == IFN_MASK_LOAD)
9930 {
9931 tree lhs = gimple_get_lhs (call);
9932 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9933 {
9934 tree zero = build_zero_cst (TREE_TYPE (lhs));
9935 gimple *new_stmt = gimple_build_assign (lhs, zero);
9936 gsi_replace (&gsi, new_stmt, true);
9937 }
9938 }
9939 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9940 {
9941 tree lhs = gimple_get_lhs (call);
9942 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9943 {
9944 tree else_arg
9945 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9946 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9947 gsi_replace (&gsi, new_stmt, true);
9948 }
9949 }
9950 }
9951 } /* BBs in loop */
9952
9953 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9954 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9955 if (integer_onep (step_vector))
9956 niters_no_overflow = true;
9957 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9958 niters_vector_mult_vf, !niters_no_overflow);
9959
9960 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9961 scale_profile_for_vect_loop (loop, assumed_vf);
9962
9963 /* True if the final iteration might not handle a full vector's
9964 worth of scalar iterations. */
9965 bool final_iter_may_be_partial
9966 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9967 /* The minimum number of iterations performed by the epilogue. This
9968 is 1 when peeling for gaps because we always need a final scalar
9969 iteration. */
9970 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9971 /* +1 to convert latch counts to loop iteration counts,
9972 -min_epilogue_iters to remove iterations that cannot be performed
9973 by the vector code. */
9974 int bias_for_lowest = 1 - min_epilogue_iters;
9975 int bias_for_assumed = bias_for_lowest;
9976 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9977 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9978 {
9979 /* When the amount of peeling is known at compile time, the first
9980 iteration will have exactly alignment_npeels active elements.
9981 In the worst case it will have at least one. */
9982 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9983 bias_for_lowest += lowest_vf - min_first_active;
9984 bias_for_assumed += assumed_vf - min_first_active;
9985 }
9986 /* In these calculations the "- 1" converts loop iteration counts
9987 back to latch counts. */
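/* For instance (hypothetical numbers): with a scalar latch bound of 102,
lowest_vf == 4, no partial vectors and no epilogue peeling, bias_for_lowest
is 1 and the vector loop's latch bound becomes (102 + 1) / 4 - 1 == 24,
i.e. at most 25 vector iterations. */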
9988 if (loop->any_upper_bound)
9989 {
9990 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9991 loop->nb_iterations_upper_bound
9992 = (final_iter_may_be_partial
9993 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9994 lowest_vf) - 1
9995 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9996 lowest_vf) - 1);
9997 if (main_vinfo
9998 /* Both peeling for alignment and peeling for gaps can end up
9999 with the scalar epilogue running for more than VF-1 iterations. */
10000 && !main_vinfo->peeling_for_alignment
10001 && !main_vinfo->peeling_for_gaps)
10002 {
10003 unsigned int bound;
10004 poly_uint64 main_iters
10005 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10006 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10007 main_iters
10008 = upper_bound (main_iters,
10009 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10010 if (can_div_away_from_zero_p (main_iters,
10011 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10012 &bound))
10013 loop->nb_iterations_upper_bound
10014 = wi::umin ((widest_int) (bound - 1),
10015 loop->nb_iterations_upper_bound);
10016 }
10017 }
10018 if (loop->any_likely_upper_bound)
10019 loop->nb_iterations_likely_upper_bound
10020 = (final_iter_may_be_partial
10021 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10022 + bias_for_lowest, lowest_vf) - 1
10023 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10024 + bias_for_lowest, lowest_vf) - 1);
10025 if (loop->any_estimate)
10026 loop->nb_iterations_estimate
10027 = (final_iter_may_be_partial
10028 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10029 assumed_vf) - 1
10030 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10031 assumed_vf) - 1);
10032
10033 if (dump_enabled_p ())
10034 {
10035 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10036 {
10037 dump_printf_loc (MSG_NOTE, vect_location,
10038 "LOOP VECTORIZED\n");
10039 if (loop->inner)
10040 dump_printf_loc (MSG_NOTE, vect_location,
10041 "OUTER LOOP VECTORIZED\n");
10042 dump_printf (MSG_NOTE, "\n");
10043 }
10044 else
10045 dump_printf_loc (MSG_NOTE, vect_location,
10046 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10047 GET_MODE_NAME (loop_vinfo->vector_mode));
10048 }
10049
10050 /* Loops vectorized with a variable factor won't benefit from
10051 unrolling/peeling. */
10052 if (!vf.is_constant ())
10053 {
10054 loop->unroll = 1;
10055 if (dump_enabled_p ())
10056 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10057 " variable-length vectorization factor\n");
10058 }
10059 /* Free SLP instances here because otherwise stmt reference counting
10060 won't work. */
10061 slp_instance instance;
10062 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10063 vect_free_slp_instance (instance);
10064 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10065 /* Clear the safelen field since its value is invalid after vectorization:
10066 the vectorized loop can have loop-carried dependences. */
10067 loop->safelen = 0;
10068
10069 if (epilogue)
10070 {
10071 update_epilogue_loop_vinfo (epilogue, advance);
10072
10073 epilogue->simduid = loop->simduid;
10074 epilogue->force_vectorize = loop->force_vectorize;
10075 epilogue->dont_vectorize = false;
10076 }
10077
10078 return epilogue;
10079 }
10080
10081 /* The code below performs a simple optimization - reverting
10082 if-conversion for masked stores: if the mask of a store is zero,
10083 skip the store and, where possible, the producers of the stored values.
10084 For example,
10085 for (i=0; i<n; i++)
10086 if (c[i])
10087 {
10088 p1[i] += 1;
10089 p2[i] = p3[i] +2;
10090 }
10091 this transformation will produce the following semi-hammock:
10092
10093 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10094 {
10095 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10096 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10097 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10098 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10099 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10100 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10101 }
10102 */
10103
10104 void
10105 optimize_mask_stores (class loop *loop)
10106 {
10107 basic_block *bbs = get_loop_body (loop);
10108 unsigned nbbs = loop->num_nodes;
10109 unsigned i;
10110 basic_block bb;
10111 class loop *bb_loop;
10112 gimple_stmt_iterator gsi;
10113 gimple *stmt;
10114 auto_vec<gimple *> worklist;
10115 auto_purge_vect_location sentinel;
10116
10117 vect_location = find_loop_location (loop);
10118 /* Pick up all masked stores in loop if any. */
10119 for (i = 0; i < nbbs; i++)
10120 {
10121 bb = bbs[i];
10122 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10123 gsi_next (&gsi))
10124 {
10125 stmt = gsi_stmt (gsi);
10126 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10127 worklist.safe_push (stmt);
10128 }
10129 }
10130
10131 free (bbs);
10132 if (worklist.is_empty ())
10133 return;
10134
10135 /* Loop has masked stores. */
10136 while (!worklist.is_empty ())
10137 {
10138 gimple *last, *last_store;
10139 edge e, efalse;
10140 tree mask;
10141 basic_block store_bb, join_bb;
10142 gimple_stmt_iterator gsi_to;
10143 tree vdef, new_vdef;
10144 gphi *phi;
10145 tree vectype;
10146 tree zero;
10147
10148 last = worklist.pop ();
10149 mask = gimple_call_arg (last, 2);
10150 bb = gimple_bb (last);
10151 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
10152 the same loop as if_bb. This loop could be different from LOOP when a
10153 two-level loop nest is vectorized and the mask_store belongs to the inner
10154 one. */
10155 e = split_block (bb, last);
10156 bb_loop = bb->loop_father;
10157 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10158 join_bb = e->dest;
10159 store_bb = create_empty_bb (bb);
10160 add_bb_to_loop (store_bb, bb_loop);
10161 e->flags = EDGE_TRUE_VALUE;
10162 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10163 /* Put STORE_BB on the unlikely path. */
10164 efalse->probability = profile_probability::unlikely ();
10165 store_bb->count = efalse->count ();
10166 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10167 if (dom_info_available_p (CDI_DOMINATORS))
10168 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10169 if (dump_enabled_p ())
10170 dump_printf_loc (MSG_NOTE, vect_location,
10171 "Create new block %d to sink mask stores.",
10172 store_bb->index);
10173 /* Create vector comparison with boolean result. */
10174 vectype = TREE_TYPE (mask);
10175 zero = build_zero_cst (vectype);
10176 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10177 gsi = gsi_last_bb (bb);
10178 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10179 /* Create new PHI node for vdef of the last masked store:
10180 .MEM_2 = VDEF <.MEM_1>
10181 will be converted to
10182 .MEM.3 = VDEF <.MEM_1>
10183 and new PHI node will be created in join bb
10184 .MEM_2 = PHI <.MEM_1, .MEM_3>
10185 */
10186 vdef = gimple_vdef (last);
10187 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10188 gimple_set_vdef (last, new_vdef);
10189 phi = create_phi_node (vdef, join_bb);
10190 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10191
10192 /* Put all masked stores with the same mask to STORE_BB if possible. */
10193 while (true)
10194 {
10195 gimple_stmt_iterator gsi_from;
10196 gimple *stmt1 = NULL;
10197
10198 /* Move masked store to STORE_BB. */
10199 last_store = last;
10200 gsi = gsi_for_stmt (last);
10201 gsi_from = gsi;
10202 /* Shift GSI to the previous stmt for further traversal. */
10203 gsi_prev (&gsi);
10204 gsi_to = gsi_start_bb (store_bb);
10205 gsi_move_before (&gsi_from, &gsi_to);
10206 /* Setup GSI_TO to the non-empty block start. */
10207 gsi_to = gsi_start_bb (store_bb);
10208 if (dump_enabled_p ())
10209 dump_printf_loc (MSG_NOTE, vect_location,
10210 "Move stmt to created bb\n%G", last);
10211 /* Move all stored value producers if possible. */
10212 while (!gsi_end_p (gsi))
10213 {
10214 tree lhs;
10215 imm_use_iterator imm_iter;
10216 use_operand_p use_p;
10217 bool res;
10218
10219 /* Skip debug statements. */
10220 if (is_gimple_debug (gsi_stmt (gsi)))
10221 {
10222 gsi_prev (&gsi);
10223 continue;
10224 }
10225 stmt1 = gsi_stmt (gsi);
10226 /* Do not consider statements writing to memory or having
10227 volatile operand. */
10228 if (gimple_vdef (stmt1)
10229 || gimple_has_volatile_ops (stmt1))
10230 break;
10231 gsi_from = gsi;
10232 gsi_prev (&gsi);
10233 lhs = gimple_get_lhs (stmt1);
10234 if (!lhs)
10235 break;
10236
10237 /* LHS of vectorized stmt must be SSA_NAME. */
10238 if (TREE_CODE (lhs) != SSA_NAME)
10239 break;
10240
10241 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10242 {
10243 /* Remove dead scalar statement. */
10244 if (has_zero_uses (lhs))
10245 {
10246 gsi_remove (&gsi_from, true);
10247 continue;
10248 }
10249 }
10250
10251 /* Check that LHS does not have uses outside of STORE_BB. */
10252 res = true;
10253 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10254 {
10255 gimple *use_stmt;
10256 use_stmt = USE_STMT (use_p);
10257 if (is_gimple_debug (use_stmt))
10258 continue;
10259 if (gimple_bb (use_stmt) != store_bb)
10260 {
10261 res = false;
10262 break;
10263 }
10264 }
10265 if (!res)
10266 break;
10267
10268 if (gimple_vuse (stmt1)
10269 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10270 break;
10271
10272 /* Can move STMT1 to STORE_BB. */
10273 if (dump_enabled_p ())
10274 dump_printf_loc (MSG_NOTE, vect_location,
10275 "Move stmt to created bb\n%G", stmt1);
10276 gsi_move_before (&gsi_from, &gsi_to);
10277 /* Shift GSI_TO for further insertion. */
10278 gsi_prev (&gsi_to);
10279 }
10280 /* Put other masked stores with the same mask to STORE_BB. */
10281 if (worklist.is_empty ()
10282 || gimple_call_arg (worklist.last (), 2) != mask
10283 || worklist.last () != stmt1)
10284 break;
10285 last = worklist.pop ();
10286 }
10287 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10288 }
10289 }
10290
10291 /* Decide whether it is possible to use a zero-based induction variable
10292 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10293 the value that the induction variable must be able to hold in order
10294 to ensure that the rgroups eventually have no active vector elements.
10295 Return -1 otherwise. */
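/* A worked example with hypothetical numbers: for a loop with at most 11
latch iterations, VF == 4, no skipped iterations and no peeling for
alignment, IV_LIMIT starts at 11, is rounded down to the previous vector
boundary (8) and then bumped by MAX_VF to 12, so the induction variable
must be able to represent the value 12. */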
10296
10297 widest_int
10298 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10299 {
10300 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10301 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10302 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10303
10304 /* Calculate the value that the induction variable must be able
10305 to hit in order to ensure that we end the loop with an all-false mask.
10306 This involves adding the maximum number of inactive trailing scalar
10307 iterations. */
10308 widest_int iv_limit = -1;
10309 if (max_loop_iterations (loop, &iv_limit))
10310 {
10311 if (niters_skip)
10312 {
10313 /* Add the maximum number of skipped iterations to the
10314 maximum iteration count. */
10315 if (TREE_CODE (niters_skip) == INTEGER_CST)
10316 iv_limit += wi::to_widest (niters_skip);
10317 else
10318 iv_limit += max_vf - 1;
10319 }
10320 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10321 /* Make a conservatively-correct assumption. */
10322 iv_limit += max_vf - 1;
10323
10324 /* IV_LIMIT is the maximum number of latch iterations, which is also
10325 the maximum in-range IV value. Round this value down to the previous
10326 vector alignment boundary and then add an extra full iteration. */
10327 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10328 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10329 }
10330 return iv_limit;
10331 }
10332
10333 /* For the given rgroup_controls RGC, check whether an induction variable
10334 would ever hit a value that produces a set of all-false masks or zero
10335 lengths before wrapping around. Return true if it's possible to wrap
10336 around before hitting the desirable value, otherwise return false. */
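/* As a hypothetical example: if vect_iv_limit_for_partial_vectors returns
12 and the rgroup handles 2 scalars per iteration (NITEMS == 2), the
induction variable has to count up to 24, which needs 5 bits; any
RGROUP_COMPARE_TYPE with at least that precision therefore cannot wrap
before reaching an all-false mask or zero length. */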
10337
10338 bool
10339 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10340 {
10341 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10342
10343 if (iv_limit == -1)
10344 return true;
10345
10346 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10347 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10348 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10349
10350 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10351 return true;
10352
10353 return false;
10354 }
10355