xref: /netbsd-src/external/gpl3/gcc/dist/gcc/tree-vect-loop.cc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 /* Loop Vectorization
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 
59 /* Loop Vectorization Pass.
60 
61    This pass tries to vectorize loops.
62 
63    For example, the vectorizer transforms the following simple loop:
64 
65         short a[N]; short b[N]; short c[N]; int i;
66 
67         for (i=0; i<N; i++){
68           a[i] = b[i] + c[i];
69         }
70 
71    as if it were manually vectorized by rewriting the source code into:
72 
73         typedef int __attribute__((mode(V8HI))) v8hi;
74         short a[N];  short b[N]; short c[N];   int i;
75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76         v8hi va, vb, vc;
77 
78         for (i=0; i<N/8; i++){
79           vb = pb[i];
80           vc = pc[i];
81           va = vb + vc;
82           pa[i] = va;
83         }
84 
85         The main entry to this pass is vectorize_loops(), in which
86    the vectorizer applies a set of analyses on a given set of loops,
87    followed by the actual vectorization transformation for the loops that
88    had successfully passed the analysis phase.
89         Throughout this pass we make a distinction between two types of
90    data: scalars (which are represented by SSA_NAMES), and memory references
91    ("data-refs").  These two types of data require different handling both
92    during analysis and transformation. The types of data-refs that the
93    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95    accesses are required to have a simple (consecutive) access pattern.
96 
97    Analysis phase:
98    ===============
99         The driver for the analysis phase is vect_analyze_loop().
100    It applies a set of analyses, some of which rely on the scalar evolution
101    analyzer (scev) developed by Sebastian Pop.
102 
103         During the analysis phase the vectorizer records some information
104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105    loop, as well as general information about the loop as a whole, which is
106    recorded in a "loop_vec_info" struct attached to each loop.
107 
108    Transformation phase:
109    =====================
110         The loop transformation phase scans all the stmts in the loop, and
111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112    the loop that needs to be vectorized.  It inserts the vector code sequence
113    just before the scalar stmt S, and records a pointer to the vector code
114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115    attached to S).  This pointer will be used for the vectorization of following
116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117    otherwise, we rely on dead code elimination for removing it.
118 
119         For example, say stmt S1 was vectorized into stmt VS1:
120 
121    VS1: vb = px[i];
122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123    S2:  a = b;
124 
125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
128    resulting sequence would be:
129 
130    VS1: vb = px[i];
131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132    VS2: va = vb;
133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 
135         Operands that are not SSA_NAMEs are data-refs that appear in
136    load/store operations (like 'x[i]' in S1), and are handled differently.
137 
138    Target modeling:
139    =================
140         Currently the only target specific information that is used is the
141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142    Targets that can support different vector sizes will, for now, need
143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
144    flexibility will be added in the future.
145 
146    Since we only vectorize operations whose vector form can be
147    expressed using existing tree codes, to verify that an operation is
148    supported, the vectorizer checks the relevant optab at the relevant
149    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
150    the value found is CODE_FOR_nothing, then there's no target support, and
151    we can't vectorize the stmt.
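
   As a rough illustration (the exact insn code returned is target
   specific), the check for the V8HI addition in the example above boils
   down to

      optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   and vectorization of the stmt is rejected when this does not hold.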
152 
153    For additional information on this project see:
154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156 
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 						unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 					       bool *, bool *);
161 
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164    may already be set for general statements (not just data refs).  */
165 
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 			      bool vectype_maybe_set_p,
169 			      poly_uint64 *vf)
170 {
171   gimple *stmt = stmt_info->stmt;
172 
173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174        && !STMT_VINFO_LIVE_P (stmt_info))
175       || gimple_clobber_p (stmt))
176     {
177       if (dump_enabled_p ())
178 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179       return opt_result::success ();
180     }
181 
182   tree stmt_vectype, nunits_vectype;
183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 						   &stmt_vectype,
185 						   &nunits_vectype);
186   if (!res)
187     return res;
188 
189   if (stmt_vectype)
190     {
191       if (STMT_VINFO_VECTYPE (stmt_info))
192 	/* The only case when a vectype had been already set is for stmts
193 	   that contain a data ref, or for "pattern-stmts" (stmts generated
194 	   by the vectorizer to represent/replace a certain idiom).  */
195 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 		     || vectype_maybe_set_p)
197 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198       else
199 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200     }
201 
202   if (nunits_vectype)
203     vect_update_max_nunits (vf, nunits_vectype);
204 
205   return opt_result::success ();
206 }
207 
208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
209    types of STMT_INFO and all attached pattern statements and update
210    the vectorization factor VF accordingly.  Return true on success
211    or false if something prevented vectorization.  */
212 
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 			    stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217   if (dump_enabled_p ())
218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 		     stmt_info->stmt);
220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221   if (!res)
222     return res;
223 
224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225       && STMT_VINFO_RELATED_STMT (stmt_info))
226     {
227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229 
230       /* If a pattern statement has def stmts, analyze them too.  */
231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 	   !gsi_end_p (si); gsi_next (&si))
233 	{
234 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 	  if (dump_enabled_p ())
236 	    dump_printf_loc (MSG_NOTE, vect_location,
237 			     "==> examining pattern def stmt: %G",
238 			     def_stmt_info->stmt);
239 	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 	  if (!res)
241 	    return res;
242 	}
243 
244       if (dump_enabled_p ())
245 	dump_printf_loc (MSG_NOTE, vect_location,
246 			 "==> examining pattern statement: %G",
247 			 stmt_info->stmt);
248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249       if (!res)
250 	return res;
251     }
252 
253   return opt_result::success ();
254 }
255 
256 /* Function vect_determine_vectorization_factor
257 
258    Determine the vectorization factor (VF).  VF is the number of data elements
259    that are operated upon in parallel in a single iteration of the vectorized
260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262    elements can fit in a single vector register.
263 
264    We currently support vectorization of loops in which all types operated upon
265    are of the same size.  Therefore this function currently sets VF according to
266    the size of the types operated upon, and fails if there are multiple sizes
267    in the loop.
268 
269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
270    original loop:
271         for (i=0; i<N; i++){
272           a[i] = b[i] + c[i];
273         }
274 
275    vectorized loop:
276         for (i=0; i<N; i+=VF){
277           a[i:VF] = b[i:VF] + c[i:VF];
278         }
279 */
280 
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286   unsigned nbbs = loop->num_nodes;
287   poly_uint64 vectorization_factor = 1;
288   tree scalar_type = NULL_TREE;
289   gphi *phi;
290   tree vectype;
291   stmt_vec_info stmt_info;
292   unsigned i;
293 
294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295 
296   for (i = 0; i < nbbs; i++)
297     {
298       basic_block bb = bbs[i];
299 
300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 	   gsi_next (&si))
302 	{
303 	  phi = si.phi ();
304 	  stmt_info = loop_vinfo->lookup_stmt (phi);
305 	  if (dump_enabled_p ())
306 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 			     phi);
308 
309 	  gcc_assert (stmt_info);
310 
311 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
312 	      || STMT_VINFO_LIVE_P (stmt_info))
313             {
314 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
316 
317 	      if (dump_enabled_p ())
318 		dump_printf_loc (MSG_NOTE, vect_location,
319 				 "get vectype for scalar type:  %T\n",
320 				 scalar_type);
321 
322 	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 	      if (!vectype)
324 		return opt_result::failure_at (phi,
325 					       "not vectorized: unsupported "
326 					       "data-type %T\n",
327 					       scalar_type);
328 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
329 
330 	      if (dump_enabled_p ())
331 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 				 vectype);
333 
334 	      if (dump_enabled_p ())
335 		{
336 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 		  dump_printf (MSG_NOTE, "\n");
339 		}
340 
341 	      vect_update_max_nunits (&vectorization_factor, vectype);
342 	    }
343 	}
344 
345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 	   gsi_next (&si))
347 	{
348 	  if (is_gimple_debug (gsi_stmt (si)))
349 	    continue;
350 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 	  opt_result res
352 	    = vect_determine_vf_for_stmt (loop_vinfo,
353 					  stmt_info, &vectorization_factor);
354 	  if (!res)
355 	    return res;
356         }
357     }
358 
359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
360   if (dump_enabled_p ())
361     {
362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363       dump_dec (MSG_NOTE, vectorization_factor);
364       dump_printf (MSG_NOTE, "\n");
365     }
366 
367   if (known_le (vectorization_factor, 1U))
368     return opt_result::failure_at (vect_location,
369 				   "not vectorized: unsupported data-type\n");
370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371   return opt_result::success ();
372 }
373 
374 
375 /* Function vect_is_simple_iv_evolution.
376 
377    FORNOW: A simple evolution of an induction variable in the loop is
378    considered a polynomial evolution.  */
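
/* As a small illustrative example, an induction variable updated as
   i = i + 4 has the access function {init, +, 4}_loop; for it this
   function returns true with *INIT = init and *STEP = 4.  */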
379 
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382                              tree * step)
383 {
384   tree init_expr;
385   tree step_expr;
386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387   basic_block bb;
388 
389   /* When there is no evolution in this loop, the evolution function
390      is not "simple".  */
391   if (evolution_part == NULL_TREE)
392     return false;
393 
394   /* When the evolution is a polynomial of degree >= 2
395      the evolution function is not "simple".  */
396   if (tree_is_chrec (evolution_part))
397     return false;
398 
399   step_expr = evolution_part;
400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401 
402   if (dump_enabled_p ())
403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
404 		     step_expr, init_expr);
405 
406   *init = init_expr;
407   *step = step_expr;
408 
409   if (TREE_CODE (step_expr) != INTEGER_CST
410       && (TREE_CODE (step_expr) != SSA_NAME
411 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 		  || !flag_associative_math)))
416       && (TREE_CODE (step_expr) != REAL_CST
417 	  || !flag_associative_math))
418     {
419       if (dump_enabled_p ())
420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421                          "step unknown.\n");
422       return false;
423     }
424 
425   return true;
426 }
427 
428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
429    what we are assuming is a double reduction.  For example, given
430    a structure like this:
431 
432       outer1:
433 	x_1 = PHI <x_4(outer2), ...>;
434 	...
435 
436       inner:
437 	x_2 = PHI <x_1(outer1), ...>;
438 	...
439 	x_3 = ...;
440 	...
441 
442       outer2:
443 	x_4 = PHI <x_3(inner)>;
444 	...
445 
446    outer loop analysis would treat x_1 as a double reduction phi and
447    this function would then return true for x_2.  */
448 
449 static bool
450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
451 {
452   use_operand_p use_p;
453   ssa_op_iter op_iter;
454   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
455     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
456       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
457 	return true;
458   return false;
459 }
460 
461 /* Function vect_analyze_scalar_cycles_1.
462 
463    Examine the cross iteration def-use cycles of scalar variables
464    in LOOP.  LOOP_VINFO represents the loop that is now being
465    considered for vectorization (can be LOOP, or an outer-loop
466    enclosing LOOP).  */
467 
468 static void
469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
470 {
471   basic_block bb = loop->header;
472   tree init, step;
473   auto_vec<stmt_vec_info, 64> worklist;
474   gphi_iterator gsi;
475   bool double_reduc, reduc_chain;
476 
477   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
478 
479   /* First - identify all inductions.  Reduction detection assumes that all the
480      inductions have been identified, therefore, this order must not be
481      changed.  */
482   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
483     {
484       gphi *phi = gsi.phi ();
485       tree access_fn = NULL;
486       tree def = PHI_RESULT (phi);
487       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
488 
489       if (dump_enabled_p ())
490 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
491 
492       /* Skip virtual phi's.  The data dependences that are associated with
493          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
494       if (virtual_operand_p (def))
495 	continue;
496 
497       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
498 
499       /* Analyze the evolution function.  */
500       access_fn = analyze_scalar_evolution (loop, def);
501       if (access_fn)
502 	{
503 	  STRIP_NOPS (access_fn);
504 	  if (dump_enabled_p ())
505 	    dump_printf_loc (MSG_NOTE, vect_location,
506 			     "Access function of PHI: %T\n", access_fn);
507 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
508 	    = initial_condition_in_loop_num (access_fn, loop->num);
509 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
510 	    = evolution_part_in_loop_num (access_fn, loop->num);
511 	}
512 
513       if (!access_fn
514 	  || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
515 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
516 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
517 	      && TREE_CODE (step) != INTEGER_CST))
518 	{
519 	  worklist.safe_push (stmt_vinfo);
520 	  continue;
521 	}
522 
523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
524 		  != NULL_TREE);
525       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
526 
527       if (dump_enabled_p ())
528 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
529       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530     }
531 
532 
533   /* Second - identify all reductions and nested cycles.  */
534   while (worklist.length () > 0)
535     {
536       stmt_vec_info stmt_vinfo = worklist.pop ();
537       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
538       tree def = PHI_RESULT (phi);
539 
540       if (dump_enabled_p ())
541 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
542 
543       gcc_assert (!virtual_operand_p (def)
544 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
545 
546       stmt_vec_info reduc_stmt_info
547 	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
548 				    &reduc_chain);
549       if (reduc_stmt_info)
550         {
551 	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
552 	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
553 	  if (double_reduc)
554 	    {
555 	      if (dump_enabled_p ())
556 		dump_printf_loc (MSG_NOTE, vect_location,
557 				 "Detected double reduction.\n");
558 
559               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
560 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
561             }
562           else
563             {
564               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
565                 {
566                   if (dump_enabled_p ())
567                     dump_printf_loc (MSG_NOTE, vect_location,
568 				     "Detected vectorizable nested cycle.\n");
569 
570                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
571                 }
572               else
573                 {
574                   if (dump_enabled_p ())
575                     dump_printf_loc (MSG_NOTE, vect_location,
576 				     "Detected reduction.\n");
577 
578                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
579 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
580                   /* Store the reduction cycles for possible vectorization in
581                      loop-aware SLP if it was not detected as reduction
582 		     chain.  */
583 		  if (! reduc_chain)
584 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
585 		      (reduc_stmt_info);
586                 }
587             }
588         }
589       else
590         if (dump_enabled_p ())
591           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
592 			   "Unknown def-use cycle pattern.\n");
593     }
594 }
595 
596 
597 /* Function vect_analyze_scalar_cycles.
598 
599    Examine the cross iteration def-use cycles of scalar variables, by
600    analyzing the loop-header PHIs of scalar variables.  Classify each
601    cycle as one of the following: invariant, induction, reduction, unknown.
602    We do that for the loop represented by LOOP_VINFO, and also for its
603    inner-loop, if it exists.
604    Examples for scalar cycles:
605 
606    Example1: reduction:
607 
608               loop1:
609               for (i=0; i<N; i++)
610                  sum += a[i];
611 
612    Example2: induction:
613 
614               loop2:
615               for (i=0; i<N; i++)
616                  a[i] = i;  */
617 
618 static void
619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
620 {
621   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622 
623   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
624 
625   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
626      Reductions in such inner-loop therefore have different properties than
627      the reductions in the nest that gets vectorized:
628      1. When vectorized, they are executed in the same order as in the original
629         scalar loop, so we can't change the order of computation when
630         vectorizing them.
631      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
632         current checks are too strict.  */
633 
634   if (loop->inner)
635     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 }
637 
638 /* Transfer group and reduction information from STMT_INFO to its
639    pattern stmt.  */
640 
641 static void
642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
643 {
644   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
645   stmt_vec_info stmtp;
646   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
647 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
648   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649   do
650     {
651       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
652       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
653 			   == STMT_VINFO_DEF_TYPE (stmt_info));
654       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
655       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
656       if (stmt_info)
657 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
658 	  = STMT_VINFO_RELATED_STMT (stmt_info);
659     }
660   while (stmt_info);
661 }
662 
663 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
664 
665 static void
666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
667 {
668   stmt_vec_info first;
669   unsigned i;
670 
671   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
672     {
673       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
674       while (next)
675 	{
676 	  if ((STMT_VINFO_IN_PATTERN_P (next)
677 	       != STMT_VINFO_IN_PATTERN_P (first))
678 	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
679 	    break;
680 	  next = REDUC_GROUP_NEXT_ELEMENT (next);
681 	}
682       /* If all reduction chain members are well-formed patterns adjust
683 	 the group to group the pattern stmts instead.  */
684       if (! next
685 	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
686 	{
687 	  if (STMT_VINFO_IN_PATTERN_P (first))
688 	    {
689 	      vect_fixup_reduc_chain (first);
690 	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
691 		= STMT_VINFO_RELATED_STMT (first);
692 	    }
693 	}
694       /* If not all stmts in the chain are patterns or if we failed
695 	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
696 	 it as regular reduction instead.  */
697       else
698 	{
699 	  stmt_vec_info vinfo = first;
700 	  stmt_vec_info last = NULL;
701 	  while (vinfo)
702 	    {
703 	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
704 	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
705 	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
706 	      last = vinfo;
707 	      vinfo = next;
708 	    }
709 	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
710 	    = vect_internal_def;
711 	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
712 	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
713 	  --i;
714 	}
715     }
716 }
717 
718 /* Function vect_get_loop_niters.
719 
720    Determine how many iterations the loop is executed and place it
721    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
722    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
723    niter information holds in ASSUMPTIONS.
724 
725    Return the loop exit condition.  */
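
/* For instance, for a simple counted loop "for (i = 0; i < n; i++)" with
   n known to be positive, the loop header runs n times and the latch
   n - 1 times, so NUMBER_OF_ITERATIONS would be n and
   NUMBER_OF_ITERATIONSM1 would be n - 1 (a sketch of the intended
   meaning, not of every case handled below).  */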
726 
727 
728 static gcond *
729 vect_get_loop_niters (class loop *loop, tree *assumptions,
730 		      tree *number_of_iterations, tree *number_of_iterationsm1)
731 {
732   edge exit = single_exit (loop);
733   class tree_niter_desc niter_desc;
734   tree niter_assumptions, niter, may_be_zero;
735   gcond *cond = get_loop_exit_condition (loop);
736 
737   *assumptions = boolean_true_node;
738   *number_of_iterationsm1 = chrec_dont_know;
739   *number_of_iterations = chrec_dont_know;
740   DUMP_VECT_SCOPE ("get_loop_niters");
741 
742   if (!exit)
743     return cond;
744 
745   may_be_zero = NULL_TREE;
746   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
747       || chrec_contains_undetermined (niter_desc.niter))
748     return cond;
749 
750   niter_assumptions = niter_desc.assumptions;
751   may_be_zero = niter_desc.may_be_zero;
752   niter = niter_desc.niter;
753 
754   if (may_be_zero && integer_zerop (may_be_zero))
755     may_be_zero = NULL_TREE;
756 
757   if (may_be_zero)
758     {
759       if (COMPARISON_CLASS_P (may_be_zero))
760 	{
761 	  /* Try to combine may_be_zero with assumptions, this can simplify
762 	     computation of niter expression.  */
763 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
764 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
765 					     niter_assumptions,
766 					     fold_build1 (TRUTH_NOT_EXPR,
767 							  boolean_type_node,
768 							  may_be_zero));
769 	  else
770 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
771 				 build_int_cst (TREE_TYPE (niter), 0),
772 				 rewrite_to_non_trapping_overflow (niter));
773 
774 	  may_be_zero = NULL_TREE;
775 	}
776       else if (integer_nonzerop (may_be_zero))
777 	{
778 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
779 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
780 	  return cond;
781 	}
782       else
783 	return cond;
784     }
785 
786   *assumptions = niter_assumptions;
787   *number_of_iterationsm1 = niter;
788 
789   /* We want the number of loop header executions which is the number
790      of latch executions plus one.
791      ???  For UINT_MAX latch executions this number overflows to zero
792      for loops like do { n++; } while (n != 0);  */
793   if (niter && !chrec_contains_undetermined (niter))
794     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
795 			  build_int_cst (TREE_TYPE (niter), 1));
796   *number_of_iterations = niter;
797 
798   return cond;
799 }
800 
801 /* Function bb_in_loop_p
802 
803    Used as predicate for dfs order traversal of the loop bbs.  */
804 
805 static bool
806 bb_in_loop_p (const_basic_block bb, const void *data)
807 {
808   const class loop *const loop = (const class loop *)data;
809   if (flow_bb_inside_loop_p (loop, bb))
810     return true;
811   return false;
812 }
813 
814 
815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
816    stmt_vec_info structs for all the stmts in LOOP_IN.  */
817 
818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
819   : vec_info (vec_info::loop, shared),
820     loop (loop_in),
821     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
822     num_itersm1 (NULL_TREE),
823     num_iters (NULL_TREE),
824     num_iters_unchanged (NULL_TREE),
825     num_iters_assumptions (NULL_TREE),
826     vector_costs (nullptr),
827     scalar_costs (nullptr),
828     th (0),
829     versioning_threshold (0),
830     vectorization_factor (0),
831     main_loop_edge (nullptr),
832     skip_main_loop_edge (nullptr),
833     skip_this_loop_edge (nullptr),
834     reusable_accumulators (),
835     suggested_unroll_factor (1),
836     max_vectorization_factor (0),
837     mask_skip_niters (NULL_TREE),
838     rgroup_compare_type (NULL_TREE),
839     simd_if_cond (NULL_TREE),
840     unaligned_dr (NULL),
841     peeling_for_alignment (0),
842     ptr_mask (0),
843     ivexpr_map (NULL),
844     scan_map (NULL),
845     slp_unrolling_factor (1),
846     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
847     vectorizable (false),
848     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
849     using_partial_vectors_p (false),
850     epil_using_partial_vectors_p (false),
851     partial_load_store_bias (0),
852     peeling_for_gaps (false),
853     peeling_for_niter (false),
854     no_data_dependencies (false),
855     has_mask_store (false),
856     scalar_loop_scaling (profile_probability::uninitialized ()),
857     scalar_loop (NULL),
858     orig_loop_info (NULL)
859 {
860   /* CHECKME: We want to visit all BBs before their successors (except for
861      latch blocks, for which this assertion wouldn't hold).  In the simple
862      case of the loop forms we allow, a dfs order of the BBs would be the same
863      as reversed postorder traversal, so we are safe.  */
864 
865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 					  bbs, loop->num_nodes, loop);
867   gcc_assert (nbbs == loop->num_nodes);
868 
869   for (unsigned int i = 0; i < nbbs; i++)
870     {
871       basic_block bb = bbs[i];
872       gimple_stmt_iterator si;
873 
874       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875 	{
876 	  gimple *phi = gsi_stmt (si);
877 	  gimple_set_uid (phi, 0);
878 	  add_stmt (phi);
879 	}
880 
881       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
882 	{
883 	  gimple *stmt = gsi_stmt (si);
884 	  gimple_set_uid (stmt, 0);
885 	  if (is_gimple_debug (stmt))
886 	    continue;
887 	  add_stmt (stmt);
888 	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
889 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
890 	     the third argument is the #pragma omp simd if (x) condition: when
891 	     0, the loop shouldn't be vectorized; when a non-zero constant, it
892 	     should be vectorized normally; otherwise the loop is versioned,
893 	     with the vectorized copy used if the condition is non-zero at runtime.  */
894 	      && is_gimple_call (stmt)
895 	      && gimple_call_internal_p (stmt)
896 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
897 	      && gimple_call_num_args (stmt) >= 3
898 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
899 	      && (loop_in->simduid
900 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
901 	    {
902 	      tree arg = gimple_call_arg (stmt, 2);
903 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
904 		simd_if_cond = arg;
905 	      else
906 		gcc_assert (integer_nonzerop (arg));
907 	    }
908 	}
909     }
910 
911   epilogue_vinfos.create (6);
912 }
913 
914 /* Free all levels of rgroup CONTROLS.  */
915 
916 void
917 release_vec_loop_controls (vec<rgroup_controls> *controls)
918 {
919   rgroup_controls *rgc;
920   unsigned int i;
921   FOR_EACH_VEC_ELT (*controls, i, rgc)
922     rgc->controls.release ();
923   controls->release ();
924 }
925 
926 /* Free all memory used by the _loop_vec_info, as well as all the
927    stmt_vec_info structs of all the stmts in the loop.  */
928 
929 _loop_vec_info::~_loop_vec_info ()
930 {
931   free (bbs);
932 
933   release_vec_loop_controls (&masks);
934   release_vec_loop_controls (&lens);
935   delete ivexpr_map;
936   delete scan_map;
937   epilogue_vinfos.release ();
938   delete scalar_costs;
939   delete vector_costs;
940 
941   /* When we release an epilogue vinfo that we do not intend to use
942      avoid clearing AUX of the main loop which should continue to
943      point to the main loop vinfo since otherwise we'll leak that.  */
944   if (loop->aux == this)
945     loop->aux = NULL;
946 }
947 
948 /* Return an invariant or register for EXPR and emit necessary
949    computations in the LOOP_VINFO loop preheader.  */
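
/* For example (a sketch of the intended use, not taken from a real dump):
   asked for an expression such as n_5 * 4, the function gimplifies it to a
   new SSA name computed on the preheader edge and caches the result in
   LOOP_VINFO's ivexpr_map, so a second request for the same expression
   returns the cached SSA name instead of emitting the computation again.  */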
950 
951 tree
952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
953 {
954   if (is_gimple_reg (expr)
955       || is_gimple_min_invariant (expr))
956     return expr;
957 
958   if (! loop_vinfo->ivexpr_map)
959     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
960   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
961   if (! cached)
962     {
963       gimple_seq stmts = NULL;
964       cached = force_gimple_operand (unshare_expr (expr),
965 				     &stmts, true, NULL_TREE);
966       if (stmts)
967 	{
968 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
969 	  gsi_insert_seq_on_edge_immediate (e, stmts);
970 	}
971     }
972   return cached;
973 }
974 
975 /* Return true if we can use CMP_TYPE as the comparison type to produce
976    all masks required to mask LOOP_VINFO.  */
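
/* A sketch of the underlying operation: .WHILE_ULT (A, B, MASK) sets lane
   I of MASK to (A + I < B), so the loop below simply asks whether the
   target can produce each recorded mask type from a CMP_TYPE comparison of
   that form.  */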
977 
978 static bool
979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
980 {
981   rgroup_controls *rgm;
982   unsigned int i;
983   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
984     if (rgm->type != NULL_TREE
985 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
986 					    cmp_type, rgm->type,
987 					    OPTIMIZE_FOR_SPEED))
988       return false;
989   return true;
990 }
991 
992 /* Calculate the maximum number of scalars per iteration for every
993    rgroup in LOOP_VINFO.  */
994 
995 static unsigned int
996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
997 {
998   unsigned int res = 1;
999   unsigned int i;
1000   rgroup_controls *rgm;
1001   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002     res = MAX (res, rgm->max_nscalars_per_iter);
1003   return res;
1004 }
1005 
1006 /* Calculate the minimum precision necessary to represent:
1007 
1008       MAX_NITERS * FACTOR
1009 
1010    as an unsigned integer, where MAX_NITERS is the maximum number of
1011    loop header iterations for the original scalar form of LOOP_VINFO.  */
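
/* For example (numbers purely illustrative): a loop known to run at most
   1000 times with FACTOR == 4 gives MAX_NITERS * FACTOR == 4000, which
   needs 12 bits as an unsigned integer, so 12 would be returned.  */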
1012 
1013 static unsigned
1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1015 {
1016   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 
1018   /* Get the maximum number of iterations that is representable
1019      in the counter type.  */
1020   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022 
1023   /* Get a more refined estimate for the number of iterations.  */
1024   widest_int max_back_edges;
1025   if (max_loop_iterations (loop, &max_back_edges))
1026     max_ni = wi::smin (max_ni, max_back_edges + 1);
1027 
1028   /* Work out how many bits we need to represent the limit.  */
1029   return wi::min_precision (max_ni * factor, UNSIGNED);
1030 }
1031 
1032 /* True if the loop needs peeling or partial vectors when vectorized.  */
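
/* For instance (an illustrative case only), a loop with exactly 100
   iterations, no peeling for alignment or gaps, and a vectorization factor
   of 16 leaves a remainder of 4 iterations, so it needs either an epilogue
   or partial vectors.  */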
1033 
1034 static bool
1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1036 {
1037   unsigned HOST_WIDE_INT const_vf;
1038   HOST_WIDE_INT max_niter
1039     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1040 
1041   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044 					  (loop_vinfo));
1045 
1046   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1048     {
1049       /* Work out the (constant) number of iterations that need to be
1050 	 peeled for reasons other than niters.  */
1051       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053 	peel_niter += 1;
1054       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056 	return true;
1057     }
1058   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059       /* ??? When peeling for gaps but not alignment, we could
1060 	 try to check whether the (variable) niters is known to be
1061 	 VF * N + 1.  That's something of a niche case though.  */
1062       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065 	   < (unsigned) exact_log2 (const_vf))
1066 	  /* In case of versioning, check if the maximum number of
1067 	     iterations is greater than th.  If they are identical,
1068 	     the epilogue is unnecessary.  */
1069 	  && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070 	      || ((unsigned HOST_WIDE_INT) max_niter
1071 		  > (th / const_vf) * const_vf))))
1072     return true;
1073 
1074   return false;
1075 }
1076 
1077 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1078    whether we can actually generate the masks required.  Return true if so,
1079    storing the compare type in LOOP_VINFO_RGROUP_COMPARE_TYPE and the IV type in LOOP_VINFO_RGROUP_IV_TYPE.  */
1080 
1081 static bool
1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1083 {
1084   unsigned int min_ni_width;
1085   unsigned int max_nscalars_per_iter
1086     = vect_get_max_nscalars_per_iter (loop_vinfo);
1087 
1088   /* Use a normal loop if there are no statements that need masking.
1089      This only happens in rare degenerate cases: it means that the loop
1090      has no loads, no stores, and no live-out values.  */
1091   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092     return false;
1093 
1094   /* Work out how many bits we need to represent the limit.  */
1095   min_ni_width
1096     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1097 
1098   /* Find a scalar mode for which WHILE_ULT is supported.  */
1099   opt_scalar_int_mode cmp_mode_iter;
1100   tree cmp_type = NULL_TREE;
1101   tree iv_type = NULL_TREE;
1102   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103   unsigned int iv_precision = UINT_MAX;
1104 
1105   if (iv_limit != -1)
1106     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107 				      UNSIGNED);
1108 
1109   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1110     {
1111       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112       if (cmp_bits >= min_ni_width
1113 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1114 	{
1115 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116 	  if (this_type
1117 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1118 	    {
1119 	      /* Although we could stop as soon as we find a valid mode,
1120 		 there are at least two reasons why that's not always the
1121 		 best choice:
1122 
1123 		 - An IV that's Pmode or wider is more likely to be reusable
1124 		   in address calculations than an IV that's narrower than
1125 		   Pmode.
1126 
1127 		 - Doing the comparison in IV_PRECISION or wider allows
1128 		   a natural 0-based IV, whereas using a narrower comparison
1129 		   type requires mitigations against wrap-around.
1130 
1131 		 Conversely, if the IV limit is variable, doing the comparison
1132 		 in a wider type than the original type can introduce
1133 		 unnecessary extensions, so picking the widest valid mode
1134 		 is not always a good choice either.
1135 
1136 		 Here we prefer the first IV type that's Pmode or wider,
1137 		 and the first comparison type that's IV_PRECISION or wider.
1138 		 (The comparison type must be no wider than the IV type,
1139 		 to avoid extensions in the vector loop.)
1140 
1141 		 ??? We might want to try continuing beyond Pmode for ILP32
1142 		 targets if CMP_BITS < IV_PRECISION.  */
1143 	      iv_type = this_type;
1144 	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145 		cmp_type = this_type;
1146 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147 		break;
1148 	    }
1149 	}
1150     }
1151 
1152   if (!cmp_type)
1153     return false;
1154 
1155   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157   return true;
1158 }
1159 
1160 /* Check whether we can use vector access with length based on precision
1161    comparison.  So far, to keep it simple, we only allow the case that the
1162    precision of the target-supported length is larger than the precision
1163    required by loop niters.  */
1164 
1165 static bool
1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1167 {
1168   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169     return false;
1170 
1171   machine_mode len_load_mode = get_len_load_store_mode
1172     (loop_vinfo->vector_mode, true).require ();
1173   machine_mode len_store_mode = get_len_load_store_mode
1174     (loop_vinfo->vector_mode, false).require ();
1175 
1176   signed char partial_load_bias = internal_len_load_store_bias
1177     (IFN_LEN_LOAD, len_load_mode);
1178 
1179   signed char partial_store_bias = internal_len_load_store_bias
1180     (IFN_LEN_STORE, len_store_mode);
1181 
1182   gcc_assert (partial_load_bias == partial_store_bias);
1183 
1184   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185     return false;
1186 
1187   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188      len_loads with a length of zero.  In order to avoid that we prohibit
1189      more than one loop length here.  */
1190   if (partial_load_bias == -1
1191       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192     return false;
1193 
1194   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1195 
1196   unsigned int max_nitems_per_iter = 1;
1197   unsigned int i;
1198   rgroup_controls *rgl;
1199   /* Find the maximum number of items per iteration for every rgroup.  */
1200   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1201     {
1202       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1204     }
1205 
1206   /* Work out how many bits we need to represent the length limit.  */
1207   unsigned int min_ni_prec
1208     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1209 
1210   /* Now use the maximum of the precisions below for one suitable IV type:
1211      - the IV's natural precision
1212      - the precision needed to hold: the maximum number of scalar
1213        iterations multiplied by the scale factor (min_ni_prec above)
1214      - the Pmode precision
1215 
1216      If min_ni_prec is less than the precision of the current niters,
1217      we prefer to still use the niters type.  Prefer to use Pmode and
1218      wider IV to avoid narrow conversions.  */
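
  /* E.g. (purely as an illustration, assuming a 64-bit Pmode and
     BITS_PER_WORD): with a 32-bit niters type and a min_ni_prec of 20, the
     maximum of the three precisions is 64, so the search below would pick
     a 64-bit IV type.  */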
1219 
1220   unsigned int ni_prec
1221     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222   min_ni_prec = MAX (min_ni_prec, ni_prec);
1223   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1224 
1225   tree iv_type = NULL_TREE;
1226   opt_scalar_int_mode tmode_iter;
1227   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1228     {
1229       scalar_mode tmode = tmode_iter.require ();
1230       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1231 
1232       /* ??? Do we really want to construct one IV whose precision exceeds
1233 	 BITS_PER_WORD?  */
1234       if (tbits > BITS_PER_WORD)
1235 	break;
1236 
1237       /* Find the first available standard integral type.  */
1238       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1239 	{
1240 	  iv_type = build_nonstandard_integer_type (tbits, true);
1241 	  break;
1242 	}
1243     }
1244 
1245   if (!iv_type)
1246     {
1247       if (dump_enabled_p ())
1248 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 			 "can't vectorize with length-based partial vectors"
1250 			 " because there is no suitable iv type.\n");
1251       return false;
1252     }
1253 
1254   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1256 
1257   return true;
1258 }
1259 
1260 /* Calculate the cost of one scalar iteration of the loop.  */
1261 static void
1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1263 {
1264   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266   int nbbs = loop->num_nodes, factor;
1267   int innerloop_iters, i;
1268 
1269   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1270 
1271   /* Gather costs for statements in the scalar loop.  */
1272 
1273   /* FORNOW.  */
1274   innerloop_iters = 1;
1275   if (loop->inner)
1276     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1277 
1278   for (i = 0; i < nbbs; i++)
1279     {
1280       gimple_stmt_iterator si;
1281       basic_block bb = bbs[i];
1282 
1283       if (bb->loop_father == loop->inner)
1284         factor = innerloop_iters;
1285       else
1286         factor = 1;
1287 
1288       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1289         {
1290 	  gimple *stmt = gsi_stmt (si);
1291 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1292 
1293           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294             continue;
1295 
1296           /* Skip stmts that are not vectorized inside the loop.  */
1297 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299               && (!STMT_VINFO_LIVE_P (vstmt_info)
1300                   || !VECTORIZABLE_CYCLE_DEF
1301 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1302             continue;
1303 
1304 	  vect_cost_for_stmt kind;
1305           if (STMT_VINFO_DATA_REF (stmt_info))
1306             {
1307               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308                kind = scalar_load;
1309              else
1310                kind = scalar_store;
1311             }
1312 	  else if (vect_nop_conversion_p (stmt_info))
1313 	    continue;
1314 	  else
1315             kind = scalar_stmt;
1316 
1317 	  /* We are using vect_prologue here to avoid scaling twice
1318 	     by the inner loop factor.  */
1319 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320 			    factor, kind, stmt_info, 0, vect_prologue);
1321         }
1322     }
1323 
1324   /* Now accumulate cost.  */
1325   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326   add_stmt_costs (loop_vinfo->scalar_costs,
1327 		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328   loop_vinfo->scalar_costs->finish_cost (nullptr);
1329 }
1330 
1331 
1332 /* Function vect_analyze_loop_form.
1333 
1334    Verify that certain CFG restrictions hold, including:
1335    - the loop has a pre-header
1336    - the loop has a single entry and exit
1337    - the loop exit condition is simple enough
1338    - the number of iterations can be analyzed, i.e., a countable loop.  The
1339      niter could be analyzed under some assumptions.  */
1340 
1341 opt_result
1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1343 {
1344   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1345 
1346   /* Different restrictions apply when we are considering an inner-most loop,
1347      vs. an outer (nested) loop.
1348      (FORNOW. May want to relax some of these restrictions in the future).  */
1349 
1350   info->inner_loop_cond = NULL;
1351   if (!loop->inner)
1352     {
1353       /* Inner-most loop.  We currently require that the number of BBs is
1354 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1355 	 look like this:
1356 
1357                         (pre-header)
1358                            |
1359                           header <--------+
1360                            | |            |
1361                            | +--> latch --+
1362                            |
1363                         (exit-bb)  */
1364 
1365       if (loop->num_nodes != 2)
1366 	return opt_result::failure_at (vect_location,
1367 				       "not vectorized:"
1368 				       " control flow in loop.\n");
1369 
1370       if (empty_block_p (loop->header))
1371 	return opt_result::failure_at (vect_location,
1372 				       "not vectorized: empty loop.\n");
1373     }
1374   else
1375     {
1376       class loop *innerloop = loop->inner;
1377       edge entryedge;
1378 
1379       /* Nested loop. We currently require that the loop is doubly-nested,
1380 	 contains a single inner loop, and the number of BBs is exactly 5.
1381 	 Vectorizable outer-loops look like this:
1382 
1383 			(pre-header)
1384 			   |
1385 			  header <---+
1386 			   |         |
1387 		          inner-loop |
1388 			   |         |
1389 			  tail ------+
1390 			   |
1391 		        (exit-bb)
1392 
1393 	 The inner-loop has the properties expected of inner-most loops
1394 	 as described above.  */
1395 
1396       if ((loop->inner)->inner || (loop->inner)->next)
1397 	return opt_result::failure_at (vect_location,
1398 				       "not vectorized:"
1399 				       " multiple nested loops.\n");
1400 
1401       if (loop->num_nodes != 5)
1402 	return opt_result::failure_at (vect_location,
1403 				       "not vectorized:"
1404 				       " control flow in loop.\n");
1405 
1406       entryedge = loop_preheader_edge (innerloop);
1407       if (entryedge->src != loop->header
1408 	  || !single_exit (innerloop)
1409 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410 	return opt_result::failure_at (vect_location,
1411 				       "not vectorized:"
1412 				       " unsupported outerloop form.\n");
1413 
1414       /* Analyze the inner-loop.  */
1415       vect_loop_form_info inner;
1416       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417       if (!res)
1418 	{
1419 	  if (dump_enabled_p ())
1420 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 			     "not vectorized: Bad inner loop.\n");
1422 	  return res;
1423 	}
1424 
1425       /* Don't support analyzing niter under assumptions for inner
1426 	 loop.  */
1427       if (!integer_onep (inner.assumptions))
1428 	return opt_result::failure_at (vect_location,
1429 				       "not vectorized: Bad inner loop.\n");
1430 
1431       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432 	return opt_result::failure_at (vect_location,
1433 				       "not vectorized: inner-loop count not"
1434 				       " invariant.\n");
1435 
1436       if (dump_enabled_p ())
1437         dump_printf_loc (MSG_NOTE, vect_location,
1438 			 "Considering outer-loop vectorization.\n");
1439       info->inner_loop_cond = inner.loop_cond;
1440     }
1441 
1442   if (!single_exit (loop))
1443     return opt_result::failure_at (vect_location,
1444 				   "not vectorized: multiple exits.\n");
1445   if (EDGE_COUNT (loop->header->preds) != 2)
1446     return opt_result::failure_at (vect_location,
1447 				   "not vectorized:"
1448 				   " too many incoming edges.\n");
1449 
1450   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     return opt_result::failure_at (vect_location,
1457 				   "not vectorized: latch block not empty.\n");
1458 
1459   /* Make sure the exit is not abnormal.  */
1460   edge e = single_exit (loop);
1461   if (e->flags & EDGE_ABNORMAL)
1462     return opt_result::failure_at (vect_location,
1463 				   "not vectorized:"
1464 				   " abnormal loop exit edge.\n");
1465 
1466   info->loop_cond
1467     = vect_get_loop_niters (loop, &info->assumptions,
1468 			    &info->number_of_iterations,
1469 			    &info->number_of_iterationsm1);
1470   if (!info->loop_cond)
1471     return opt_result::failure_at
1472       (vect_location,
1473        "not vectorized: complicated exit condition.\n");
1474 
1475   if (integer_zerop (info->assumptions)
1476       || !info->number_of_iterations
1477       || chrec_contains_undetermined (info->number_of_iterations))
1478     return opt_result::failure_at
1479       (info->loop_cond,
1480        "not vectorized: number of iterations cannot be computed.\n");
1481 
1482   if (integer_zerop (info->number_of_iterations))
1483     return opt_result::failure_at
1484       (info->loop_cond,
1485        "not vectorized: number of iterations = 0.\n");
1486 
1487   if (!(tree_fits_shwi_p (info->number_of_iterations)
1488 	&& tree_to_shwi (info->number_of_iterations) > 0))
1489     {
1490       if (dump_enabled_p ())
1491 	{
1492 	  dump_printf_loc (MSG_NOTE, vect_location,
1493 			   "Symbolic number of iterations is ");
1494 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495 	  dump_printf (MSG_NOTE, "\n");
1496 	}
1497     }
1498 
1499   return opt_result::success ();
1500 }
1501 
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503    vect_analyze_loop_form result.  */
1504 
1505 loop_vec_info
1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507 			const vect_loop_form_info *info,
1508 			loop_vec_info main_loop_info)
1509 {
1510   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515   /* Also record the assumptions for versioning.  */
1516   if (!integer_onep (info->assumptions) && !main_loop_info)
1517     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1518 
1519   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521   if (info->inner_loop_cond)
1522     {
1523       stmt_vec_info inner_loop_cond_info
1524 	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526       /* If we have an estimate on the number of iterations of the inner
1527 	 loop, use that to limit the scale for costing; otherwise use
1528 	 --param vect-inner-loop-cost-factor literally.  */
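      /* As a worked example (illustrative numbers only): with an estimated 10
	 executions of the inner loop body and --param
	 vect-inner-loop-cost-factor=50, the factor used below is
	 MIN (10, 50) = 10.  */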
1529       widest_int nit;
1530       if (estimated_stmt_executions (loop->inner, &nit))
1531 	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532 	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1533     }
1534 
1535   return loop_vinfo;
1536 }
1537 
1538 
1539 
1540 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1541    statements, update the vectorization factor.  */
1542 
1543 static void
1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1545 {
1546   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548   int nbbs = loop->num_nodes;
1549   poly_uint64 vectorization_factor;
1550   int i;
1551 
1552   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1553 
1554   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555   gcc_assert (known_ne (vectorization_factor, 0U));
1556 
1557   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1558      vectorization factor of the loop is the unrolling factor required by
1559      the SLP instances.  If that unrolling factor is 1, we say that we
1560      perform pure SLP on the loop - cross-iteration parallelism is not
1561      exploited.  */
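  /* As an illustration (an editor's sketch, not from the original sources):
     in

	 for (i = 0; i < n; i++)
	   {
	     a[2*i] = b[i];
	     a[2*i+1] = c[i];
	   }

     the two stores form an SLP group and the loop can be handled purely by
     SLP, whereas a loop that additionally contains a statement only handled
     by loop-based vectorization is hybrid and the VF must satisfy both.  */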
1562   bool only_slp_in_loop = true;
1563   for (i = 0; i < nbbs; i++)
1564     {
1565       basic_block bb = bbs[i];
1566       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567 	   gsi_next (&si))
1568 	{
1569 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570 	  if (!stmt_info)
1571 	    continue;
1572 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 	      && !PURE_SLP_STMT (stmt_info))
1575 	    /* STMT needs both SLP and loop-based vectorization.  */
1576 	    only_slp_in_loop = false;
1577 	}
1578       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 	   gsi_next (&si))
1580 	{
1581 	  if (is_gimple_debug (gsi_stmt (si)))
1582 	    continue;
1583 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1585 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587 	      && !PURE_SLP_STMT (stmt_info))
1588 	    /* STMT needs both SLP and loop-based vectorization.  */
1589 	    only_slp_in_loop = false;
1590 	}
1591     }
1592 
1593   if (only_slp_in_loop)
1594     {
1595       if (dump_enabled_p ())
1596 	dump_printf_loc (MSG_NOTE, vect_location,
1597 			 "Loop contains only SLP stmts\n");
1598       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1599     }
1600   else
1601     {
1602       if (dump_enabled_p ())
1603 	dump_printf_loc (MSG_NOTE, vect_location,
1604 			 "Loop contains SLP and non-SLP stmts\n");
1605       /* Both the vectorization factor and unroll factor have the form
1606 	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607 	 so they must have a common multiple.  */
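      /* For instance (illustrative values): a loop VF of 4 combined with an
	 SLP unrolling factor of 2 yields a common multiple of 4, while an
	 unrolling factor of 8 would raise the VF to 8.  */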
1608       vectorization_factor
1609 	= force_common_multiple (vectorization_factor,
1610 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611     }
1612 
1613   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614   if (dump_enabled_p ())
1615     {
1616       dump_printf_loc (MSG_NOTE, vect_location,
1617 		       "Updating vectorization factor to ");
1618       dump_dec (MSG_NOTE, vectorization_factor);
1619       dump_printf (MSG_NOTE, ".\n");
1620     }
1621 }
1622 
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624    the other phi in the reduction is also relevant for vectorization.
1625    This rejects cases such as:
1626 
1627       outer1:
1628 	x_1 = PHI <x_3(outer2), ...>;
1629 	...
1630 
1631       inner:
1632 	x_2 = ...;
1633 	...
1634 
1635       outer2:
1636 	x_3 = PHI <x_2(inner)>;
1637 
1638    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1639 
1640 static bool
1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1642 {
1643   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644     return false;
1645 
1646   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 }
1648 
1649 /* Function vect_analyze_loop_operations.
1650 
1651    Scan the loop stmts and make sure they are all vectorizable.  */
1652 
1653 static opt_result
1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1655 {
1656   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658   int nbbs = loop->num_nodes;
1659   int i;
1660   stmt_vec_info stmt_info;
1661   bool need_to_vectorize = false;
1662   bool ok;
1663 
1664   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1665 
1666   auto_vec<stmt_info_for_cost> cost_vec;
1667 
1668   for (i = 0; i < nbbs; i++)
1669     {
1670       basic_block bb = bbs[i];
1671 
1672       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673 	   gsi_next (&si))
1674         {
1675           gphi *phi = si.phi ();
1676           ok = true;
1677 
1678 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1679           if (dump_enabled_p ())
1680 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681 	  if (virtual_operand_p (gimple_phi_result (phi)))
1682 	    continue;
1683 
1684           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685              (i.e., a phi in the tail of the outer-loop).  */
1686           if (! is_loop_header_bb_p (bb))
1687             {
1688               /* FORNOW: we currently don't support the case that these phis
1689                  are not used in the outer loop (unless it is a double reduction,
1690                  i.e., this phi is vect_reduction_def), because this case
1691                  would require us to actually do something here.  */
1692               if (STMT_VINFO_LIVE_P (stmt_info)
1693 		  && !vect_active_double_reduction_p (stmt_info))
1694 		return opt_result::failure_at (phi,
1695 					       "Unsupported loop-closed phi"
1696 					       " in outer-loop.\n");
1697 
1698               /* If PHI is used in the outer loop, we check that its operand
1699                  is defined in the inner loop.  */
1700               if (STMT_VINFO_RELEVANT_P (stmt_info))
1701                 {
1702                   tree phi_op;
1703 
1704                   if (gimple_phi_num_args (phi) != 1)
1705                     return opt_result::failure_at (phi, "unsupported phi");
1706 
1707                   phi_op = PHI_ARG_DEF (phi, 0);
1708 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709 		  if (!op_def_info)
1710 		    return opt_result::failure_at (phi, "unsupported phi\n");
1711 
1712 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713 		      && (STMT_VINFO_RELEVANT (op_def_info)
1714 			  != vect_used_in_outer_by_reduction))
1715 		    return opt_result::failure_at (phi, "unsupported phi\n");
1716 
1717 		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718 		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1719 			   == vect_double_reduction_def))
1720 		      && !vectorizable_lc_phi (loop_vinfo,
1721 					       stmt_info, NULL, NULL))
1722 		    return opt_result::failure_at (phi, "unsupported phi\n");
1723                 }
1724 
1725               continue;
1726             }
1727 
1728           gcc_assert (stmt_info);
1729 
1730           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731                || STMT_VINFO_LIVE_P (stmt_info))
1732               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733 	    /* A scalar-dependence cycle that we don't support.  */
1734 	    return opt_result::failure_at (phi,
1735 					   "not vectorized:"
1736 					   " scalar dependence cycle.\n");
1737 
1738           if (STMT_VINFO_RELEVANT_P (stmt_info))
1739             {
1740               need_to_vectorize = true;
1741               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742 		  && ! PURE_SLP_STMT (stmt_info))
1743 		ok = vectorizable_induction (loop_vinfo,
1744 					     stmt_info, NULL, NULL,
1745 					     &cost_vec);
1746 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747 			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1748 			    == vect_double_reduction_def)
1749 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750 		       && ! PURE_SLP_STMT (stmt_info))
1751 		ok = vectorizable_reduction (loop_vinfo,
1752 					     stmt_info, NULL, NULL, &cost_vec);
1753             }
1754 
1755 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1756 	  if (ok
1757 	      && STMT_VINFO_LIVE_P (stmt_info)
1758 	      && !PURE_SLP_STMT (stmt_info))
1759 	    ok = vectorizable_live_operation (loop_vinfo,
1760 					      stmt_info, NULL, NULL, NULL,
1761 					      -1, false, &cost_vec);
1762 
1763           if (!ok)
1764 	    return opt_result::failure_at (phi,
1765 					   "not vectorized: relevant phi not "
1766 					   "supported: %G",
1767 					   static_cast <gimple *> (phi));
1768         }
1769 
1770       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771 	   gsi_next (&si))
1772         {
1773 	  gimple *stmt = gsi_stmt (si);
1774 	  if (!gimple_clobber_p (stmt)
1775 	      && !is_gimple_debug (stmt))
1776 	    {
1777 	      opt_result res
1778 		= vect_analyze_stmt (loop_vinfo,
1779 				     loop_vinfo->lookup_stmt (stmt),
1780 				     &need_to_vectorize,
1781 				     NULL, NULL, &cost_vec);
1782 	      if (!res)
1783 		return res;
1784 	    }
1785         }
1786     } /* bbs */
1787 
1788   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1789 
1790   /* All operations in the loop are either irrelevant (they deal with loop
1791      control, or are dead), or only used outside the loop and can be moved
1792      out of the loop (e.g. invariants, inductions).  The loop can be
1793      optimized away by scalar optimizations.  We're better off not
1794      touching this loop.  */
1795   if (!need_to_vectorize)
1796     {
1797       if (dump_enabled_p ())
1798         dump_printf_loc (MSG_NOTE, vect_location,
1799 			 "All the computation can be taken out of the loop.\n");
1800       return opt_result::failure_at
1801 	(vect_location,
1802 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1803     }
1804 
1805   return opt_result::success ();
1806 }
1807 
1808 /* Return true if we know that the iteration count is smaller than the
1809    vectorization factor.  Return false if it isn't, or if we can't be sure
1810    either way.  */
1811 
1812 static bool
1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1814 {
1815   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 
1817   HOST_WIDE_INT max_niter;
1818   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820   else
1821     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1822 
1823   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824     return true;
1825 
1826   return false;
1827 }
1828 
1829 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1830    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1831    definitely no, or -1 if it's worth retrying.  */
1832 
1833 static int
1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835 			   unsigned *suggested_unroll_factor)
1836 {
1837   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1839 
1840   /* Only loops that can handle partially-populated vectors can have iteration
1841      counts less than the vectorization factor.  */
1842   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1843     {
1844       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1845 	{
1846 	  if (dump_enabled_p ())
1847 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 			     "not vectorized: iteration count smaller than "
1849 			     "vectorization factor.\n");
1850 	  return 0;
1851 	}
1852     }
1853 
1854   /* If using the "very cheap" model, reject cases in which we'd keep
1855      a copy of the scalar code (even if we might be able to vectorize it).  */
1856   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858 	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859 	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1860     {
1861       if (dump_enabled_p ())
1862 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 			 "some scalar iterations would need to be peeled\n");
1864       return 0;
1865     }
1866 
1867   int min_profitable_iters, min_profitable_estimate;
1868   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869 				      &min_profitable_estimate,
1870 				      suggested_unroll_factor);
1871 
1872   if (min_profitable_iters < 0)
1873     {
1874       if (dump_enabled_p ())
1875 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 			 "not vectorized: vectorization not profitable.\n");
1877       if (dump_enabled_p ())
1878 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879 			 "not vectorized: vector version will never be "
1880 			 "profitable.\n");
1881       return -1;
1882     }
1883 
1884   int min_scalar_loop_bound = (param_min_vect_loop_bound
1885 			       * assumed_vf);
1886 
1887   /* Use the cost model only if it is more conservative than user specified
1888      threshold.  */
1889   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890 				    min_profitable_iters);
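  /* As a worked example (made-up numbers): with --param min-vect-loop-bound=2
     and an assumed VF of 4, min_scalar_loop_bound is 8; if the cost model
     reports min_profitable_iters = 12, the threshold used below is 12.  */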
1891 
1892   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1893 
1894   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1896     {
1897       if (dump_enabled_p ())
1898 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 			 "not vectorized: vectorization not profitable.\n");
1900       if (dump_enabled_p ())
1901 	dump_printf_loc (MSG_NOTE, vect_location,
1902 			 "not vectorized: iteration count smaller than user "
1903 			 "specified loop bound parameter or minimum profitable "
1904 			 "iterations (whichever is more conservative).\n");
1905       return 0;
1906     }
1907 
1908   /* The static profitability threshold min_profitable_estimate includes
1909      the cost of having to check at runtime whether the scalar loop
1910      should be used instead.  If it turns out that we don't need or want
1911      such a check, the threshold we should use for the static estimate
1912      is simply the point at which the vector loop becomes more profitable
1913      than the scalar loop.  */
1914   if (min_profitable_estimate > min_profitable_iters
1915       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1919     {
1920       if (dump_enabled_p ())
1921 	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922 			 " choice between the scalar and vector loops\n");
1923       min_profitable_estimate = min_profitable_iters;
1924     }
1925 
1926   /* If the vector loop needs multiple iterations to be beneficial then
1927      things are probably too close to call, and the conservative thing
1928      would be to stick with the scalar code.  */
1929   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1931     {
1932       if (dump_enabled_p ())
1933 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 			 "one iteration of the vector loop would be"
1935 			 " more expensive than the equivalent number of"
1936 			 " iterations of the scalar loop\n");
1937       return 0;
1938     }
1939 
1940   HOST_WIDE_INT estimated_niter;
1941 
1942   /* If we are vectorizing an epilogue then we know the maximum number of
1943      scalar iterations it will cover is at least one lower than the
1944      vectorization factor of the main loop.  */
1945   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946     estimated_niter
1947       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948   else
1949     {
1950       estimated_niter = estimated_stmt_executions_int (loop);
1951       if (estimated_niter == -1)
1952 	estimated_niter = likely_max_stmt_executions_int (loop);
1953     }
1954   if (estimated_niter != -1
1955       && ((unsigned HOST_WIDE_INT) estimated_niter
1956 	  < MAX (th, (unsigned) min_profitable_estimate)))
1957     {
1958       if (dump_enabled_p ())
1959 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960 			 "not vectorized: estimated iteration count too "
1961 			 "small.\n");
1962       if (dump_enabled_p ())
1963 	dump_printf_loc (MSG_NOTE, vect_location,
1964 			 "not vectorized: estimated iteration count smaller "
1965 			 "than specified loop bound parameter or minimum "
1966 			 "profitable iterations (whichever is more "
1967 			 "conservative).\n");
1968       return -1;
1969     }
1970 
1971   return 1;
1972 }
1973 
1974 static opt_result
1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976 			   vec<data_reference_p> *datarefs,
1977 			   unsigned int *n_stmts)
1978 {
1979   *n_stmts = 0;
1980   for (unsigned i = 0; i < loop->num_nodes; i++)
1981     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982 	 !gsi_end_p (gsi); gsi_next (&gsi))
1983       {
1984 	gimple *stmt = gsi_stmt (gsi);
1985 	if (is_gimple_debug (stmt))
1986 	  continue;
1987 	++(*n_stmts);
1988 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989 							NULL, 0);
1990 	if (!res)
1991 	  {
1992 	    if (is_gimple_call (stmt) && loop->safelen)
1993 	      {
1994 		tree fndecl = gimple_call_fndecl (stmt), op;
1995 		if (fndecl != NULL_TREE)
1996 		  {
1997 		    cgraph_node *node = cgraph_node::get (fndecl);
1998 		    if (node != NULL && node->simd_clones != NULL)
1999 		      {
2000 			unsigned int j, n = gimple_call_num_args (stmt);
2001 			for (j = 0; j < n; j++)
2002 			  {
2003 			    op = gimple_call_arg (stmt, j);
2004 			    if (DECL_P (op)
2005 				|| (REFERENCE_CLASS_P (op)
2006 				    && get_base_address (op)))
2007 			      break;
2008 			  }
2009 			op = gimple_call_lhs (stmt);
2010 			/* Ignore #pragma omp declare simd functions
2011 			   if they don't have data references in the
2012 			   call stmt itself.  */
2013 			if (j == n
2014 			    && !(op
2015 				 && (DECL_P (op)
2016 				     || (REFERENCE_CLASS_P (op)
2017 					 && get_base_address (op)))))
2018 			  continue;
2019 		      }
2020 		  }
2021 	      }
2022 	    return res;
2023 	  }
2024 	/* If dependence analysis will give up due to the limit on the
2025 	   number of datarefs, stop here and fail fatally.  */
2026 	if (datarefs->length ()
2027 	    > (unsigned)param_loop_max_datarefs_for_datadeps)
2028 	  return opt_result::failure_at (stmt, "exceeded param "
2029 					 "loop-max-datarefs-for-datadeps\n");
2030       }
2031   return opt_result::success ();
2032 }
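/* For illustration only (an editor's sketch of a hypothetical case, not
   from the original sources): in

       #pragma omp declare simd
       extern int f (int);

       void g (int *restrict a, int n)
       {
       #pragma omp simd
	 for (int i = 0; i < n; i++)
	   a[i] = f (i);
       }

   the call to f has no data references in the call statement itself, so the
   failed data-reference analysis for that call is ignored above and the loop
   remains a vectorization candidate via the simd clone of f.  */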
2033 
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035    group.  */
2036 static void
2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2038 {
2039   unsigned int i;
2040   struct data_reference *dr;
2041 
2042   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2043 
2044   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045   FOR_EACH_VEC_ELT (datarefs, i, dr)
2046     {
2047       gcc_assert (DR_REF (dr));
2048       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2049 
2050       /* Check if the access is part of an interleaving chain.  */
2051       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 	{
2053 	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055 	  unsigned int group_size = DR_GROUP_SIZE (first_element);
2056 
2057 	  /* Check if this is an SLP-only group.  */
2058 	  if (!STMT_SLP_TYPE (stmt_info)
2059 	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
2060 	    {
2061 	      /* Dissolve the group.  */
2062 	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2063 
2064 	      stmt_vec_info vinfo = first_element;
2065 	      while (vinfo)
2066 		{
2067 		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068 		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069 		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070 		  DR_GROUP_SIZE (vinfo) = 1;
2071 		  if (STMT_VINFO_STRIDED_P (first_element))
2072 		    DR_GROUP_GAP (vinfo) = 0;
2073 		  else
2074 		    DR_GROUP_GAP (vinfo) = group_size - 1;
2075 		  /* Duplicate and adjust alignment info, it needs to
2076 		     be present on each group leader, see dr_misalignment.  */
2077 		  if (vinfo != first_element)
2078 		    {
2079 		      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080 		      dr_info2->target_alignment = dr_info->target_alignment;
2081 		      int misalignment = dr_info->misalignment;
2082 		      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2083 			{
2084 			  HOST_WIDE_INT diff
2085 			    = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086 			       - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087 			  unsigned HOST_WIDE_INT align_c
2088 			    = dr_info->target_alignment.to_constant ();
2089 			  misalignment = (misalignment + diff) % align_c;
2090 			}
2091 		      dr_info2->misalignment = misalignment;
2092 		    }
2093 		  vinfo = next;
2094 		}
2095 	    }
2096 	}
2097     }
2098 }
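/* A worked example for the misalignment adjustment above (made-up numbers):
   if the group leader has target_alignment 16 and misalignment 4, and a
   dissolved member's DR_INIT lies 8 bytes further on, that member's
   misalignment becomes (4 + 8) % 16 = 12.  */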
2099 
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101    some scalar iterations still to do.  If so, decide how we should
2102    handle those scalar iterations.  The possibilities are:
2103 
2104    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105        In this case:
2106 
2107 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108 	 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109 	 LOOP_VINFO_PEELING_FOR_NITER == false
2110 
2111    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112        to handle the remaining scalar iterations.  In this case:
2113 
2114 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115 	 LOOP_VINFO_PEELING_FOR_NITER == true
2116 
2117        There are two choices:
2118 
2119        (2a) Consider vectorizing the epilogue loop at the same VF as the
2120 	    main loop, but using partial vectors instead of full vectors.
2121 	    In this case:
2122 
2123 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2124 
2125        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126 	    In this case:
2127 
2128 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2129 
2130    When FOR_EPILOGUE_P is true, make this determination based on the
2131    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132    based on the assumption that LOOP_VINFO is the main loop.  The caller
2133    has made sure that the number of iterations is set appropriately for
2134    this value of FOR_EPILOGUE_P.  */
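/* As an illustration (an editor's example, not from the original sources):
   with VF = 4 and 10 scalar iterations, option (1) executes 3 vector
   iterations, the last operating on a partial vector of 2 lanes, whereas
   option (2) executes 2 full vector iterations and leaves 2 scalar
   iterations to the epilogue handling.  */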
2135 
2136 opt_result
2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138 					    bool for_epilogue_p)
2139 {
2140   /* Determine whether there would be any scalar iterations left over.  */
2141   bool need_peeling_or_partial_vectors_p
2142     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2143 
2144   /* Decide whether to vectorize the loop with partial vectors.  */
2145   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148       && need_peeling_or_partial_vectors_p)
2149     {
2150       /* For partial-vector-usage=1, try to push the handling of partial
2151 	 vectors to the epilogue, with the main loop continuing to operate
2152 	 on full vectors.
2153 
2154 	 If we are unrolling we also do not want to use partial vectors. This
2155 	 is to avoid the overhead of generating multiple masks and also to
2156 	 avoid having to execute entire iterations of FALSE masked instructions
2157 	 when dealing with one or fewer full iterations.
2158 
2159 	 ??? We could then end up failing to use partial vectors if we
2160 	 decide to peel iterations into a prologue, and if the main loop
2161 	 then ends up processing fewer than VF iterations.  */
2162       if ((param_vect_partial_vector_usage == 1
2163 	   || loop_vinfo->suggested_unroll_factor > 1)
2164 	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165 	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166 	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167       else
2168 	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2169     }
2170 
2171   if (dump_enabled_p ())
2172     {
2173       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 	dump_printf_loc (MSG_NOTE, vect_location,
2175 			 "operating on partial vectors%s.\n",
2176 			 for_epilogue_p ? " for epilogue loop" : "");
2177       else
2178 	dump_printf_loc (MSG_NOTE, vect_location,
2179 			 "operating only on full vectors%s.\n",
2180 			 for_epilogue_p ? " for epilogue loop" : "");
2181     }
2182 
2183   if (for_epilogue_p)
2184     {
2185       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186       gcc_assert (orig_loop_vinfo);
2187       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188 	gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189 			      LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2190     }
2191 
2192   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2194     {
2195       /* Check that the loop processes at least one full vector.  */
2196       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198       if (known_lt (wi::to_widest (scalar_niters), vf))
2199 	return opt_result::failure_at (vect_location,
2200 				       "loop does not have enough iterations"
2201 				       " to support vectorization.\n");
2202 
2203       /* If we need to peel an extra epilogue iteration to handle data
2204 	 accesses with gaps, check that there are enough scalar iterations
2205 	 available.
2206 
2207 	 The check above is redundant with this one when peeling for gaps,
2208 	 but the distinction is useful for diagnostics.  */
2209       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211 	  && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212 	return opt_result::failure_at (vect_location,
2213 				       "loop does not have enough iterations"
2214 				       " to support peeling for gaps.\n");
2215     }
2216 
2217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219        && need_peeling_or_partial_vectors_p);
2220 
2221   return opt_result::success ();
2222 }
2223 
2224 /* Function vect_analyze_loop_2.
2225 
2226    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227    for it.  The different analyses will record information in the
2228    loop_vec_info struct.  */
2229 static opt_result
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231 		     unsigned *suggested_unroll_factor)
2232 {
2233   opt_result ok = opt_result::success ();
2234   int res;
2235   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236   poly_uint64 min_vf = 2;
2237   loop_vec_info orig_loop_vinfo = NULL;
2238 
2239   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240      loop_vec_info of the first vectorized loop.  */
2241   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243   else
2244     orig_loop_vinfo = loop_vinfo;
2245   gcc_assert (orig_loop_vinfo);
2246 
2247   /* The first group of checks is independent of the vector size.  */
2248   fatal = true;
2249 
2250   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252     return opt_result::failure_at (vect_location,
2253 				   "not vectorized: simd if(0)\n");
2254 
2255   /* Find all data references in the loop (which correspond to vdefs/vuses)
2256      and analyze their evolution in the loop.  */
2257 
2258   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2259 
2260   /* Gather the data references and count stmts in the loop.  */
2261   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2262     {
2263       opt_result res
2264 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
2266 				     &LOOP_VINFO_N_STMTS (loop_vinfo));
2267       if (!res)
2268 	{
2269 	  if (dump_enabled_p ())
2270 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 			     "not vectorized: loop contains function "
2272 			     "calls or data references that cannot "
2273 			     "be analyzed\n");
2274 	  return res;
2275 	}
2276       loop_vinfo->shared->save_datarefs ();
2277     }
2278   else
2279     loop_vinfo->shared->check_datarefs ();
2280 
2281   /* Analyze the data references and also adjust the minimal
2282      vectorization factor according to the loads and stores.  */
2283 
2284   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 			 "bad data references.\n");
2290       return ok;
2291     }
2292 
2293   /* Classify all cross-iteration scalar data-flow cycles.
2294      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2295   vect_analyze_scalar_cycles (loop_vinfo);
2296 
2297   vect_pattern_recog (loop_vinfo);
2298 
2299   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2300 
2301   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2303 
2304   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305   if (!ok)
2306     {
2307       if (dump_enabled_p ())
2308 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 			 "bad data access.\n");
2310       return ok;
2311     }
2312 
2313   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2314 
2315   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316   if (!ok)
2317     {
2318       if (dump_enabled_p ())
2319 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 			 "unexpected pattern.\n");
2321       return ok;
2322     }
2323 
2324   /* From here on, failures are not fatal, even though the rest of the analysis below still depends on the above in some way.  */
2325   fatal = false;
2326 
2327   /* Analyze data dependences between the data-refs in the loop
2328      and adjust the maximum vectorization factor according to
2329      the dependences.
2330      FORNOW: fail at the first data dependence that we encounter.  */
2331 
2332   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333   if (!ok)
2334     {
2335       if (dump_enabled_p ())
2336 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 			 "bad data dependence.\n");
2338       return ok;
2339     }
2340   if (max_vf != MAX_VECTORIZATION_FACTOR
2341       && maybe_lt (max_vf, min_vf))
2342     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2344 
2345   ok = vect_determine_vectorization_factor (loop_vinfo);
2346   if (!ok)
2347     {
2348       if (dump_enabled_p ())
2349 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350 			 "can't determine vectorization factor.\n");
2351       return ok;
2352     }
2353   if (max_vf != MAX_VECTORIZATION_FACTOR
2354       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2356 
2357   /* Compute the scalar iteration cost.  */
2358   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2359 
2360   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 
2362   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2363   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364   if (!ok)
2365     return ok;
2366 
2367   /* If there are any SLP instances mark them as pure_slp.  */
2368   bool slp = vect_make_slp_decision (loop_vinfo);
2369   if (slp)
2370     {
2371       /* Find stmts that need to be both vectorized and SLPed.  */
2372       vect_detect_hybrid_slp (loop_vinfo);
2373 
2374       /* Update the vectorization factor based on the SLP decision.  */
2375       vect_update_vf_for_slp (loop_vinfo);
2376 
2377       /* Optimize the SLP graph with the vectorization factor fixed.  */
2378       vect_optimize_slp (loop_vinfo);
2379 
2380       /* Gather the loads reachable from the SLP graph entries.  */
2381       vect_gather_slp_loads (loop_vinfo);
2382     }
2383 
2384   bool saved_can_use_partial_vectors_p
2385     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2386 
2387   /* We don't expect to have to roll back to anything other than an empty
2388      set of rgroups.  */
2389   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2390 
2391   /* This is the point where we can re-start analysis with SLP forced off.  */
2392 start_over:
2393 
2394   /* Apply the suggested unrolling factor; this was determined by the backend
2395      during finish_cost the first time we ran the analysis for this
2396      vector mode.  */
2397   if (loop_vinfo->suggested_unroll_factor > 1)
2398     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2399 
2400   /* Now the vectorization factor is final.  */
2401   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402   gcc_assert (known_ne (vectorization_factor, 0U));
2403 
2404   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2405     {
2406       dump_printf_loc (MSG_NOTE, vect_location,
2407 		       "vectorization_factor = ");
2408       dump_dec (MSG_NOTE, vectorization_factor);
2409       dump_printf (MSG_NOTE, ", niters = %wd\n",
2410 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2411     }
2412 
2413   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2414 
2415   /* Analyze the alignment of the data-refs in the loop.
2416      Fail if a data reference is found that cannot be vectorized.  */
2417 
2418   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419   if (!ok)
2420     {
2421       if (dump_enabled_p ())
2422 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423 			 "bad data alignment.\n");
2424       return ok;
2425     }
2426 
2427   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428      It is important to call pruning after vect_analyze_data_ref_accesses,
2429      since we use grouping information gathered by interleaving analysis.  */
2430   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431   if (!ok)
2432     return ok;
2433 
2434   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435      vectorization, since we do not want to add extra peeling or
2436      add versioning for alignment.  */
2437   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438     /* This pass will decide on using loop versioning and/or loop peeling in
2439        order to enhance the alignment of data references in the loop.  */
2440     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441   if (!ok)
2442     return ok;
2443 
2444   if (slp)
2445     {
2446       /* Analyze operations in the SLP instances.  Note this may
2447 	 remove unsupported SLP instances which makes the above
2448 	 SLP kind detection invalid.  */
2449       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450       vect_slp_analyze_operations (loop_vinfo);
2451       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2452 	{
2453 	  ok = opt_result::failure_at (vect_location,
2454 				       "unsupported SLP instances\n");
2455 	  goto again;
2456 	}
2457 
2458       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2459       slp_tree load_node, slp_root;
2460       unsigned i, x;
2461       slp_instance instance;
2462       bool can_use_lanes = true;
2463       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2464 	{
2465 	  slp_root = SLP_INSTANCE_TREE (instance);
2466 	  int group_size = SLP_TREE_LANES (slp_root);
2467 	  tree vectype = SLP_TREE_VECTYPE (slp_root);
2468 	  bool loads_permuted = false;
2469 	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2470 	    {
2471 	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472 		continue;
2473 	      unsigned j;
2474 	      stmt_vec_info load_info;
2475 	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476 		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2477 		  {
2478 		    loads_permuted = true;
2479 		    break;
2480 		  }
2481 	    }
2482 
2483 	  /* If the loads and stores can be handled with load/store-lane
2484 	     instructions record it and move on to the next instance.  */
2485 	  if (loads_permuted
2486 	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487 	      && vect_store_lanes_supported (vectype, group_size, false))
2488 	    {
2489 	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2490 		{
2491 		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492 		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493 		  /* Use SLP for strided accesses (or if we can't
2494 		     use load-lanes).  */
2495 		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496 		      || ! vect_load_lanes_supported
2497 			    (STMT_VINFO_VECTYPE (stmt_vinfo),
2498 			     DR_GROUP_SIZE (stmt_vinfo), false))
2499 		    break;
2500 		}
2501 
2502 	      can_use_lanes
2503 		= can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2504 
2505 	      if (can_use_lanes && dump_enabled_p ())
2506 		dump_printf_loc (MSG_NOTE, vect_location,
2507 				 "SLP instance %p can use load/store-lanes\n",
2508 				 instance);
2509 	    }
2510 	  else
2511 	    {
2512 	      can_use_lanes = false;
2513 	      break;
2514 	    }
2515 	}
2516 
2517       /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 	 with SLP disabled.  */
2519       if (can_use_lanes)
2520 	{
2521 	  ok = opt_result::failure_at (vect_location,
2522 				       "Built SLP cancelled: can use "
2523 				       "load/store-lanes\n");
2524 	  if (dump_enabled_p ())
2525 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 			     "Built SLP cancelled: all SLP instances support "
2527 			     "load/store-lanes\n");
2528 	  goto again;
2529 	}
2530     }
2531 
2532   /* Dissolve SLP-only groups.  */
2533   vect_dissolve_slp_only_groups (loop_vinfo);
2534 
2535   /* Scan all the remaining operations in the loop that are not subject
2536      to SLP and make sure they are vectorizable.  */
2537   ok = vect_analyze_loop_operations (loop_vinfo);
2538   if (!ok)
2539     {
2540       if (dump_enabled_p ())
2541 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 			 "bad operation or unsupported loop bound.\n");
2543       return ok;
2544     }
2545 
2546   /* For now, we don't expect to mix both masking and length approaches for one
2547      loop; disable the use of partial vectors if both are recorded.  */
2548   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2551     {
2552       if (dump_enabled_p ())
2553 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 			 "can't vectorize a loop with partial vectors"
2555 			 " because we don't expect to mix different"
2556 			 " approaches with partial vectors for the"
2557 			 " same loop.\n");
2558       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2559     }
2560 
2561   /* If we still have the option of using partial vectors,
2562      check whether we can generate the necessary loop controls.  */
2563   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564       && !vect_verify_full_masking (loop_vinfo)
2565       && !vect_verify_loop_lens (loop_vinfo))
2566     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2567 
2568   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569      to be able to handle fewer than VF scalars, or needs to have a lower VF
2570      than the main loop.  */
2571   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574 		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575     return opt_result::failure_at (vect_location,
2576 				   "Vectorization factor too high for"
2577 				   " epilogue loop.\n");
2578 
2579   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580      assuming that the loop will be used as a main loop.  We will redo
2581      this analysis later if we instead decide to use the loop as an
2582      epilogue loop.  */
2583   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584   if (!ok)
2585     return ok;
2586 
2587   /* Check the costings of the loop make vectorizing worthwhile.  */
2588   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589   if (res < 0)
2590     {
2591       ok = opt_result::failure_at (vect_location,
2592 				   "Loop costings may not be worthwhile.\n");
2593       goto again;
2594     }
2595   if (!res)
2596     return opt_result::failure_at (vect_location,
2597 				   "Loop costings not worthwhile.\n");
2598 
2599   /* If an epilogue loop is required make sure we can create one.  */
2600   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602     {
2603       if (dump_enabled_p ())
2604         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605       if (!vect_can_advance_ivs_p (loop_vinfo)
2606 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607 					   single_exit (LOOP_VINFO_LOOP
2608 							 (loop_vinfo))))
2609         {
2610 	  ok = opt_result::failure_at (vect_location,
2611 				       "not vectorized: can't create required "
2612 				       "epilog loop\n");
2613           goto again;
2614         }
2615     }
2616 
2617   /* During peeling, we need to check if the number of loop iterations is
2618      enough for both the peeled prolog loop and the vector loop.  This check
2619      can be merged along with threshold check of loop versioning, so
2620      increase threshold for this case if necessary.
2621 
2622      If we are analyzing an epilogue we still want to check what its
2623      versioning threshold would be.  If we decide to vectorize the epilogues we
2624      will want to use the lowest versioning threshold of all epilogues and main
2625      loop.  This will enable us to enter a vectorized epilogue even when
2626      versioning the loop.  We can't simply check whether the epilogue requires
2627      versioning though since we may have skipped some versioning checks when
2628      analyzing the epilogue.  For instance, checks for alias versioning will be
2629      skipped when dealing with epilogues as we assume we already checked them
2630      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2631   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2632     {
2633       poly_uint64 niters_th = 0;
2634       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2635 
2636       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2637 	{
2638 	  /* Niters for peeled prolog loop.  */
2639 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2640 	    {
2641 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2644 	    }
2645 	  else
2646 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 	}
2648 
2649       /* Niters for at least one iteration of vectorized loop.  */
2650       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652       /* One additional iteration because of peeling for gap.  */
2653       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654 	niters_th += 1;
2655 
2656       /*  Use the same condition as vect_transform_loop to decide when to use
2657 	  the cost to determine a versioning threshold.  */
2658       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659 	  && ordered_p (th, niters_th))
2660 	niters_th = ordered_max (poly_uint64 (th), niters_th);
2661 
2662       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2663     }
2664 
2665   gcc_assert (known_eq (vectorization_factor,
2666 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2667 
2668   /* Ok to vectorize!  */
2669   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670   return opt_result::success ();
2671 
2672 again:
2673   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2674   gcc_assert (!ok);
2675 
2676   /* Try again with SLP forced off but if we didn't do any SLP there is
2677      no point in re-trying.  */
2678   if (!slp)
2679     return ok;
2680 
2681   /* If there are reduction chains re-trying will fail anyway.  */
2682   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683     return ok;
2684 
2685   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686      via interleaving or lane instructions.  */
2687   slp_instance instance;
2688   slp_tree node;
2689   unsigned i, j;
2690   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691     {
2692       stmt_vec_info vinfo;
2693       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695 	continue;
2696       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697       unsigned int size = DR_GROUP_SIZE (vinfo);
2698       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699       if (! vect_store_lanes_supported (vectype, size, false)
2700 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701 	 && ! vect_grouped_store_supported (vectype, size))
2702 	return opt_result::failure_at (vinfo->stmt,
2703 				       "unsupported grouped store\n");
2704       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2705 	{
2706 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709 	  size = DR_GROUP_SIZE (vinfo);
2710 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2711 	  if (! vect_load_lanes_supported (vectype, size, false)
2712 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2713 						size))
2714 	    return opt_result::failure_at (vinfo->stmt,
2715 					   "unsupported grouped load\n");
2716 	}
2717     }
2718 
2719   if (dump_enabled_p ())
2720     dump_printf_loc (MSG_NOTE, vect_location,
2721 		     "re-trying with SLP disabled\n");
2722 
2723   /* Roll back state appropriately.  No SLP this time.  */
2724   slp = false;
2725   /* Restore vectorization factor as it were without SLP.  */
2726   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727   /* Free the SLP instances.  */
2728   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729     vect_free_slp_instance (instance);
2730   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731   /* Reset SLP type to loop_vect on all stmts.  */
2732   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2733     {
2734       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736 	   !gsi_end_p (si); gsi_next (&si))
2737 	{
2738 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2740 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741 	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2742 	    {
2743 	      /* vectorizable_reduction adjusts reduction stmt def-types,
2744 		 restore them to that of the PHI.  */
2745 	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746 		= STMT_VINFO_DEF_TYPE (stmt_info);
2747 	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 					(STMT_VINFO_REDUC_DEF (stmt_info)))
2749 		= STMT_VINFO_DEF_TYPE (stmt_info);
2750 	    }
2751 	}
2752       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753 	   !gsi_end_p (si); gsi_next (&si))
2754 	{
2755 	  if (is_gimple_debug (gsi_stmt (si)))
2756 	    continue;
2757 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2759 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2760 	    {
2761 	      stmt_vec_info pattern_stmt_info
2762 		= STMT_VINFO_RELATED_STMT (stmt_info);
2763 	      if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764 		STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2765 
2766 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767 	      STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769 		   !gsi_end_p (pi); gsi_next (&pi))
2770 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771 		  = loop_vect;
2772 	    }
2773 	}
2774     }
2775   /* Free optimized alias test DDRS.  */
2776   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779   /* Reset target cost data.  */
2780   delete loop_vinfo->vector_costs;
2781   loop_vinfo->vector_costs = nullptr;
2782   /* Reset accumulated rgroup information.  */
2783   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785   /* Reset assorted flags.  */
2786   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791     = saved_can_use_partial_vectors_p;
2792 
2793   goto start_over;
2794 }
2795 
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2798    OLD_LOOP_VINFO is better unless something specifically indicates
2799    otherwise.
2800 
2801    Note that this deliberately isn't a partial order.  */
2802 
2803 static bool
2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805 			  loop_vec_info old_loop_vinfo)
2806 {
2807   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2809 
2810   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2812 
2813   /* Always prefer a VF of loop->simdlen over any other VF.  */
2814   if (loop->simdlen)
2815     {
2816       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818       if (new_simdlen_p != old_simdlen_p)
2819 	return new_simdlen_p;
2820     }
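  /* For example (illustrative): with "#pragma omp simd simdlen(8)" a
     candidate with VF 8 is preferred over one with VF 16 regardless of
     their relative costs, and vice versa.  */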
2821 
2822   const auto *old_costs = old_loop_vinfo->vector_costs;
2823   const auto *new_costs = new_loop_vinfo->vector_costs;
2824   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2826 
2827   return new_costs->better_main_loop_than_p (old_costs);
2828 }
2829 
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2831    true if we should.  */
2832 
2833 static bool
2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835 			loop_vec_info old_loop_vinfo)
2836 {
2837   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838     return false;
2839 
2840   if (dump_enabled_p ())
2841     dump_printf_loc (MSG_NOTE, vect_location,
2842 		     "***** Preferring vector mode %s to vector mode %s\n",
2843 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845   return true;
2846 }
2847 
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I], treating it as an epilogue if
2849    MAIN_LOOP_VINFO is not NULL.  If VECTOR_MODES[MODE_I] is VOIDmode, set
2850    AUTODETECTED_VECTOR_MODE and advance MODE_I to the next mode worth analyzing.
2851    Return the loop_vinfo on success and a wrapped null on failure.  */
2852 
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855 		     const vect_loop_form_info *loop_form_info,
2856 		     loop_vec_info main_loop_vinfo,
2857 		     const vector_modes &vector_modes, unsigned &mode_i,
2858 		     machine_mode &autodetected_vector_mode,
2859 		     bool &fatal)
2860 {
2861   loop_vec_info loop_vinfo
2862     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2863 
2864   machine_mode vector_mode = vector_modes[mode_i];
2865   loop_vinfo->vector_mode = vector_mode;
2866   unsigned int suggested_unroll_factor = 1;
2867 
2868   /* Run the main analysis.  */
2869   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870 					&suggested_unroll_factor);
2871   if (dump_enabled_p ())
2872     dump_printf_loc (MSG_NOTE, vect_location,
2873 		     "***** Analysis %s with vector mode %s\n",
2874 		     res ? "succeeded" : "failed",
2875 		     GET_MODE_NAME (loop_vinfo->vector_mode));
2876 
2877   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
2878     {
2879       if (dump_enabled_p ())
2880 	dump_printf_loc (MSG_NOTE, vect_location,
2881 			 "***** Re-trying analysis for unrolling"
2882 			 " with unroll factor %d.\n",
2883 			 suggested_unroll_factor);
2884       loop_vec_info unroll_vinfo
2885 	= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886       unroll_vinfo->vector_mode = vector_mode;
2887       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889       if (new_res)
2890 	{
2891 	  delete loop_vinfo;
2892 	  loop_vinfo = unroll_vinfo;
2893 	}
2894       else
2895 	delete unroll_vinfo;
2896     }
2897 
2898   /* Remember the autodetected vector mode.  */
2899   if (vector_mode == VOIDmode)
2900     autodetected_vector_mode = loop_vinfo->vector_mode;
2901 
2902   /* Advance mode_i, first skipping modes that would give the same
2903      analysis result.  */
2904   while (mode_i + 1 < vector_modes.length ()
2905 	 && vect_chooses_same_modes_p (loop_vinfo,
2906 				       vector_modes[mode_i + 1]))
2907     {
2908       if (dump_enabled_p ())
2909 	dump_printf_loc (MSG_NOTE, vect_location,
2910 			 "***** The result for vector mode %s would"
2911 			 " be the same\n",
2912 			 GET_MODE_NAME (vector_modes[mode_i + 1]));
2913       mode_i += 1;
2914     }
2915   if (mode_i + 1 < vector_modes.length ()
2916       && VECTOR_MODE_P (autodetected_vector_mode)
2917       && (related_vector_mode (vector_modes[mode_i + 1],
2918 			       GET_MODE_INNER (autodetected_vector_mode))
2919 	  == autodetected_vector_mode)
2920       && (related_vector_mode (autodetected_vector_mode,
2921 			       GET_MODE_INNER (vector_modes[mode_i + 1]))
2922 	  == vector_modes[mode_i + 1]))
2923     {
2924       if (dump_enabled_p ())
2925 	dump_printf_loc (MSG_NOTE, vect_location,
2926 			 "***** Skipping vector mode %s, which would"
2927 			 " repeat the analysis for %s\n",
2928 			 GET_MODE_NAME (vector_modes[mode_i + 1]),
2929 			 GET_MODE_NAME (autodetected_vector_mode));
2930       mode_i += 1;
2931     }
2932   mode_i++;
2933 
2934   if (!res)
2935     {
2936       delete loop_vinfo;
2937       if (fatal)
2938 	gcc_checking_assert (main_loop_vinfo == NULL);
2939       return opt_loop_vec_info::propagate_failure (res);
2940     }
2941 
2942   return opt_loop_vec_info::success (loop_vinfo);
2943 }
2944 
2945 /* Function vect_analyze_loop.
2946 
2947    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948    for it.  The different analyses will record information in the
2949    loop_vec_info struct.  */
2950 opt_loop_vec_info
2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2952 {
2953   DUMP_VECT_SCOPE ("analyze_loop_nest");
2954 
2955   if (loop_outer (loop)
2956       && loop_vec_info_for_loop (loop_outer (loop))
2957       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958     return opt_loop_vec_info::failure_at (vect_location,
2959 					  "outer-loop already vectorized.\n");
2960 
2961   if (!find_loop_nest (loop, &shared->loop_nest))
2962     return opt_loop_vec_info::failure_at
2963       (vect_location,
2964        "not vectorized: loop nest containing two or more consecutive inner"
2965        " loops cannot be vectorized\n");
2966 
2967   /* Analyze the loop form.  */
2968   vect_loop_form_info loop_form_info;
2969   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970   if (!res)
2971     {
2972       if (dump_enabled_p ())
2973 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 			 "bad loop form.\n");
2975       return opt_loop_vec_info::propagate_failure (res);
2976     }
2977   if (!integer_onep (loop_form_info.assumptions))
2978     {
2979       /* We consider to vectorize this loop by versioning it under
2980 	 some assumptions.  In order to do this, we need to clear
2981 	 existing information computed by scev and niter analyzer.  */
2982       scev_reset_htab ();
2983       free_numbers_of_iterations_estimates (loop);
2984       /* Also set flag for this loop so that following scev and niter
2985 	 analysis are done under the assumptions.  */
2986       loop_constraint_set (loop, LOOP_C_FINITE);
2987     }
2988 
2989   auto_vector_modes vector_modes;
2990   /* Autodetect first vector size we try.  */
2991   vector_modes.safe_push (VOIDmode);
2992   unsigned int autovec_flags
2993     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994 						    loop->simdlen != 0);
2995   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996 			     && !unlimited_cost_model (loop));
2997   machine_mode autodetected_vector_mode = VOIDmode;
2998   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999   unsigned int mode_i = 0;
3000   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3001 
3002   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3003      a mode has not been analyzed.  */
3004   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005   for (unsigned i = 0; i < vector_modes.length (); ++i)
3006     cached_vf_per_mode.safe_push (0);
3007 
3008   /* First determine the main loop vectorization mode, either the first
3009      one that works, starting with auto-detecting the vector mode and then
3010      following the targets order of preference, or the one with the
3011      lowest cost if pick_lowest_cost_p.  */
3012   while (1)
3013     {
3014       bool fatal;
3015       unsigned int last_mode_i = mode_i;
3016       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017 	 failed.  */
3018       cached_vf_per_mode[last_mode_i] = -1;
3019       opt_loop_vec_info loop_vinfo
3020 	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021 			       NULL, vector_modes, mode_i,
3022 			       autodetected_vector_mode, fatal);
3023       if (fatal)
3024 	break;
3025 
3026       if (loop_vinfo)
3027 	{
3028 	  /* Analysis has been successful, so update the VF value.  The
3029 	      VF should always be a multiple of unroll_factor and we want to
3030 	      capture the original VF here.  */
3031 	  cached_vf_per_mode[last_mode_i]
3032 	    = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033 			 loop_vinfo->suggested_unroll_factor);
3034 	  /* Once we hit the desired simdlen for the first time,
3035 	     discard any previous attempts.  */
3036 	  if (simdlen
3037 	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 	    {
3039 	      delete first_loop_vinfo;
3040 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041 	      simdlen = 0;
3042 	    }
3043 	  else if (pick_lowest_cost_p
3044 		   && first_loop_vinfo
3045 		   && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046 	    {
3047 	      /* Pick loop_vinfo over first_loop_vinfo.  */
3048 	      delete first_loop_vinfo;
3049 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 	    }
3051 	  if (first_loop_vinfo == NULL)
3052 	    first_loop_vinfo = loop_vinfo;
3053 	  else
3054 	    {
3055 	      delete loop_vinfo;
3056 	      loop_vinfo = opt_loop_vec_info::success (NULL);
3057 	    }
3058 
3059 	  /* Commit to first_loop_vinfo if we have no reason to try
3060 	     alternatives.  */
3061 	  if (!simdlen && !pick_lowest_cost_p)
3062 	    break;
3063 	}
3064       if (mode_i == vector_modes.length ()
3065 	  || autodetected_vector_mode == VOIDmode)
3066 	break;
3067 
3068       /* Try the next biggest vector size.  */
3069       if (dump_enabled_p ())
3070 	dump_printf_loc (MSG_NOTE, vect_location,
3071 			 "***** Re-trying analysis with vector mode %s\n",
3072 			 GET_MODE_NAME (vector_modes[mode_i]));
3073     }
3074   if (!first_loop_vinfo)
3075     return opt_loop_vec_info::propagate_failure (res);
3076 
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079 		     "***** Choosing vector mode %s\n",
3080 		     GET_MODE_NAME (first_loop_vinfo->vector_mode));
3081 
3082   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083      enabled, SIMDUID is not set, it is the innermost loop and we have
3084      either already found the loop's SIMDLEN or there was no SIMDLEN to
3085      begin with.
3086      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3087   bool vect_epilogues = (!simdlen
3088 			 && loop->inner == NULL
3089 			 && param_vect_epilogues_nomask
3090 			 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091 			 && !loop->simduid);
3092   if (!vect_epilogues)
3093     return first_loop_vinfo;
3094 
3095   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3096   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3097 
3098   /* For epilogues start the analysis from the first mode.  The motivation
3099      behind starting from the beginning comes from cases where the VECTOR_MODES
3100      array may contain length-agnostic and length-specific modes.  Their
3101      ordering is not guaranteed, so we could end up picking a mode for the main
3102      loop that is after the epilogue's optimal mode.  */
3103   vector_modes[0] = autodetected_vector_mode;
3104   mode_i = 0;
3105 
3106   bool supports_partial_vectors =
3107     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3109 
3110   while (1)
3111     {
3112       /* If the target does not support partial vectors we can shorten the
3113 	 number of modes to analyze for the epilogue as we know we can't pick a
3114 	 mode that would lead to a VF at least as big as the
3115 	 FIRST_VINFO_VF.  */
3116       if (!supports_partial_vectors
3117 	  && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3118 	{
3119 	  mode_i++;
3120 	  if (mode_i == vector_modes.length ())
3121 	    break;
3122 	  continue;
3123 	}
3124 
3125       if (dump_enabled_p ())
3126 	dump_printf_loc (MSG_NOTE, vect_location,
3127 			 "***** Re-trying epilogue analysis with vector "
3128 			 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3129 
3130       bool fatal;
3131       opt_loop_vec_info loop_vinfo
3132 	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133 			       first_loop_vinfo,
3134 			       vector_modes, mode_i,
3135 			       autodetected_vector_mode, fatal);
3136       if (fatal)
3137 	break;
3138 
3139       if (loop_vinfo)
3140 	{
3141 	  if (pick_lowest_cost_p)
3142 	    {
3143 	      /* Keep trying to roll back vectorization attempts while the
3144 		 loop_vec_infos they produced were worse than this one.  */
3145 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146 	      while (!vinfos.is_empty ()
3147 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3148 		{
3149 		  gcc_assert (vect_epilogues);
3150 		  delete vinfos.pop ();
3151 		}
3152 	    }
3153 	  /* For now only allow one epilogue loop.  */
3154 	  if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3155 	    {
3156 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158 	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159 			  || maybe_ne (lowest_th, 0U));
3160 	      /* Keep track of the known smallest versioning
3161 		 threshold.  */
3162 	      if (ordered_p (lowest_th, th))
3163 		lowest_th = ordered_min (lowest_th, th);
3164 	    }
3165 	  else
3166 	    {
3167 	      delete loop_vinfo;
3168 	      loop_vinfo = opt_loop_vec_info::success (NULL);
3169 	    }
3170 
3171 	  /* For now only allow one epilogue loop, but allow
3172 	     pick_lowest_cost_p to replace it, so commit to the
3173 	     first epilogue if we have no reason to try alternatives.  */
3174 	  if (!pick_lowest_cost_p)
3175 	    break;
3176 	}
3177 
3178       if (mode_i == vector_modes.length ())
3179 	break;
3180 
3181     }
3182 
3183   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3184     {
3185       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186       if (dump_enabled_p ())
3187 	dump_printf_loc (MSG_NOTE, vect_location,
3188 			 "***** Choosing epilogue vector mode %s\n",
3189 			 GET_MODE_NAME
3190 			   (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3191     }
3192 
3193   return first_loop_vinfo;
3194 }
3195 
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197    it in *REDUC_FN if so.  */
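/* For illustration of the in-order semantics: IFN_FOLD_LEFT_PLUS
   accumulates the vector elements into the scalar result strictly from
   left to right, e.g. for a 4-element vector v and accumulator s it
   computes (((s + v[0]) + v[1]) + v[2]) + v[3], matching the scalar
   evaluation order of the original loop.  */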
3198 
3199 static bool
3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3201 {
3202   if (code == PLUS_EXPR)
3203     {
3204       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205       return true;
3206     }
3207   return false;
3208 }
3209 
3210 /* Function reduction_fn_for_scalar_code
3211 
3212    Input:
3213    CODE - tree_code of a reduction operation.
3214 
3215    Output:
3216    REDUC_FN - the corresponding internal function to be used to reduce the
3217       vector of partial results into a single scalar result, or IFN_LAST
3218       if the operation is a supported reduction operation, but does not have
3219       such an internal function.
3220 
3221    Return FALSE if CODE currently cannot be vectorized as reduction.  */
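/* For example, IFN_REDUC_MAX reduces the vector { 3, 7, 2, 5 } to the
   scalar 7 and IFN_REDUC_PLUS reduces it to 17, whereas MULT_EXPR and
   MINUS_EXPR are accepted as reductions but have no corresponding
   internal function, so *REDUC_FN is set to IFN_LAST for them.  */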
3222 
3223 bool
3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3225 {
3226   if (code.is_tree_code ())
3227     switch (tree_code (code))
3228       {
3229       case MAX_EXPR:
3230 	*reduc_fn = IFN_REDUC_MAX;
3231 	return true;
3232 
3233       case MIN_EXPR:
3234 	*reduc_fn = IFN_REDUC_MIN;
3235 	return true;
3236 
3237       case PLUS_EXPR:
3238 	*reduc_fn = IFN_REDUC_PLUS;
3239 	return true;
3240 
3241       case BIT_AND_EXPR:
3242 	*reduc_fn = IFN_REDUC_AND;
3243 	return true;
3244 
3245       case BIT_IOR_EXPR:
3246 	*reduc_fn = IFN_REDUC_IOR;
3247 	return true;
3248 
3249       case BIT_XOR_EXPR:
3250 	*reduc_fn = IFN_REDUC_XOR;
3251 	return true;
3252 
3253       case MULT_EXPR:
3254       case MINUS_EXPR:
3255 	*reduc_fn = IFN_LAST;
3256 	return true;
3257 
3258       default:
3259 	return false;
3260       }
3261   else
3262     switch (combined_fn (code))
3263       {
3264       CASE_CFN_FMAX:
3265 	*reduc_fn = IFN_REDUC_FMAX;
3266 	return true;
3267 
3268       CASE_CFN_FMIN:
3269 	*reduc_fn = IFN_REDUC_FMIN;
3270 	return true;
3271 
3272       default:
3273 	return false;
3274       }
3275 }
3276 
3277 /* If there is a neutral value X such that a reduction would not be affected
3278    by the introduction of additional X elements, return that X, otherwise
3279    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3280    of the scalar elements.  If the reduction has just a single initial value
3281    then INITIAL_VALUE is that value, otherwise it is null.  */
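/* For example, 0 is neutral for PLUS_EXPR because x + 0 == x, 1 is
   neutral for MULT_EXPR and an all-ones value is neutral for
   BIT_AND_EXPR, whereas MIN_EXPR and MAX_EXPR have no universal
   neutral element, so only a known initial value can be used to pad
   the vector safely.  */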
3282 
3283 tree
3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285 			  tree initial_value)
3286 {
3287   if (code.is_tree_code ())
3288     switch (tree_code (code))
3289       {
3290       case WIDEN_SUM_EXPR:
3291       case DOT_PROD_EXPR:
3292       case SAD_EXPR:
3293       case PLUS_EXPR:
3294       case MINUS_EXPR:
3295       case BIT_IOR_EXPR:
3296       case BIT_XOR_EXPR:
3297 	return build_zero_cst (scalar_type);
3298 
3299       case MULT_EXPR:
3300 	return build_one_cst (scalar_type);
3301 
3302       case BIT_AND_EXPR:
3303 	return build_all_ones_cst (scalar_type);
3304 
3305       case MAX_EXPR:
3306       case MIN_EXPR:
3307 	return initial_value;
3308 
3309       default:
3310 	return NULL_TREE;
3311       }
3312   else
3313     switch (combined_fn (code))
3314       {
3315       CASE_CFN_FMIN:
3316       CASE_CFN_FMAX:
3317 	return initial_value;
3318 
3319       default:
3320 	return NULL_TREE;
3321       }
3322 }
3323 
3324 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3325    STMT is printed with a message MSG. */
3326 
3327 static void
3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3329 {
3330   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3331 }
3332 
3333 /* Return true if we need an in-order (fold-left) reduction for
3334    operation CODE on type TYPE, i.e. if the vectorized reduction must
3335    preserve the original scalar evaluation order.  */
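/* For example, without -fassociative-math a floating-point sum must be
   computed in source order because FP addition is not associative: in
   double precision (-1e100 + 1e100) + 1.0 is 1.0 whereas
   -1e100 + (1e100 + 1.0) is 0.0.  Likewise, when signed overflow traps
   (e.g. with -ftrapv), reassociating an integer sum could introduce an
   intermediate overflow that the original order avoids.  */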
3336 
3337 bool
3338 needs_fold_left_reduction_p (tree type, code_helper code)
3339 {
3340   /* CHECKME: check for !flag_finite_math_only too?  */
3341   if (SCALAR_FLOAT_TYPE_P (type))
3342     {
3343       if (code.is_tree_code ())
3344 	switch (tree_code (code))
3345 	  {
3346 	  case MIN_EXPR:
3347 	  case MAX_EXPR:
3348 	    return false;
3349 
3350 	  default:
3351 	    return !flag_associative_math;
3352 	  }
3353       else
3354 	switch (combined_fn (code))
3355 	  {
3356 	  CASE_CFN_FMIN:
3357 	  CASE_CFN_FMAX:
3358 	    return false;
3359 
3360 	  default:
3361 	    return !flag_associative_math;
3362 	  }
3363     }
3364 
3365   if (INTEGRAL_TYPE_P (type))
3366     return (!code.is_tree_code ()
3367 	    || !operation_no_trapping_overflow (type, tree_code (code)));
3368 
3369   if (SAT_FIXED_POINT_TYPE_P (type))
3370     return true;
3371 
3372   return false;
3373 }
3374 
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376    has a handled computation expression.  Store the main reduction
3377    operation in *CODE.  */
3378 
3379 static bool
3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381 		      tree loop_arg, code_helper *code,
3382 		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3383 {
3384   auto_bitmap visited;
3385   tree lookfor = PHI_RESULT (phi);
3386   ssa_op_iter curri;
3387   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388   while (USE_FROM_PTR (curr) != loop_arg)
3389     curr = op_iter_next_use (&curri);
3390   curri.i = curri.numops;
3391   do
3392     {
3393       path.safe_push (std::make_pair (curri, curr));
3394       tree use = USE_FROM_PTR (curr);
3395       if (use == lookfor)
3396 	break;
3397       gimple *def = SSA_NAME_DEF_STMT (use);
3398       if (gimple_nop_p (def)
3399 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3400 	{
3401 pop:
3402 	  do
3403 	    {
3404 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405 	      curri = x.first;
3406 	      curr = x.second;
3407 	      do
3408 		curr = op_iter_next_use (&curri);
3409 	      /* Skip already visited or non-SSA operands (from iterating
3410 	         over PHI args).  */
3411 	      while (curr != NULL_USE_OPERAND_P
3412 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413 			 || ! bitmap_set_bit (visited,
3414 					      SSA_NAME_VERSION
3415 					        (USE_FROM_PTR (curr)))));
3416 	    }
3417 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418 	  if (curr == NULL_USE_OPERAND_P)
3419 	    break;
3420 	}
3421       else
3422 	{
3423 	  if (gimple_code (def) == GIMPLE_PHI)
3424 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425 	  else
3426 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427 	  while (curr != NULL_USE_OPERAND_P
3428 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429 		     || ! bitmap_set_bit (visited,
3430 					  SSA_NAME_VERSION
3431 					    (USE_FROM_PTR (curr)))))
3432 	    curr = op_iter_next_use (&curri);
3433 	  if (curr == NULL_USE_OPERAND_P)
3434 	    goto pop;
3435 	}
3436     }
3437   while (1);
3438   if (dump_file && (dump_flags & TDF_DETAILS))
3439     {
3440       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441       unsigned i;
3442       std::pair<ssa_op_iter, use_operand_p> *x;
3443       FOR_EACH_VEC_ELT (path, i, x)
3444 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445       dump_printf (MSG_NOTE, "\n");
3446     }
3447 
3448   /* Check whether the reduction path detected is valid.  */
3449   bool fail = path.length () == 0;
3450   bool neg = false;
3451   int sign = -1;
3452   *code = ERROR_MARK;
3453   for (unsigned i = 1; i < path.length (); ++i)
3454     {
3455       gimple *use_stmt = USE_STMT (path[i].second);
3456       gimple_match_op op;
3457       if (!gimple_extract_op (use_stmt, &op))
3458 	{
3459 	  fail = true;
3460 	  break;
3461 	}
3462       unsigned int opi = op.num_ops;
3463       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3464 	{
3465 	  /* The following makes sure we can compute the operand index
3466 	     easily; it also mostly disallows chaining via COND_EXPR
3467 	     condition operands.  */
3468 	  for (opi = 0; opi < op.num_ops; ++opi)
3469 	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470 	      break;
3471 	}
3472       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3473 	{
3474 	  for (opi = 0; opi < op.num_ops; ++opi)
3475 	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476 	      break;
3477 	}
3478       if (opi == op.num_ops)
3479 	{
3480 	  fail = true;
3481 	  break;
3482 	}
3483       op.code = canonicalize_code (op.code, op.type);
3484       if (op.code == MINUS_EXPR)
3485 	{
3486 	  op.code = PLUS_EXPR;
3487 	  /* Track whether we negate the reduction value each iteration.  */
3488 	  if (op.ops[1] == op.ops[opi])
3489 	    neg = ! neg;
3490 	}
3491       if (CONVERT_EXPR_CODE_P (op.code)
3492 	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3493 	;
3494       else if (*code == ERROR_MARK)
3495 	{
3496 	  *code = op.code;
3497 	  sign = TYPE_SIGN (op.type);
3498 	}
3499       else if (op.code != *code)
3500 	{
3501 	  fail = true;
3502 	  break;
3503 	}
3504       else if ((op.code == MIN_EXPR
3505 		|| op.code == MAX_EXPR)
3506 	       && sign != TYPE_SIGN (op.type))
3507 	{
3508 	  fail = true;
3509 	  break;
3510 	}
3511       /* Check there's only a single stmt the op is used on.  For the
3512 	 non-value-changing tail and the last stmt allow out-of-loop uses.
3513 	 ???  We could relax this and handle arbitrary live stmts by
3514 	 forcing a scalar epilogue for example.  */
3515       imm_use_iterator imm_iter;
3516       use_operand_p use_p;
3517       gimple *op_use_stmt;
3518       unsigned cnt = 0;
3519       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3520 	if (!is_gimple_debug (op_use_stmt)
3521 	    && (*code != ERROR_MARK
3522 		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3523 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3524 	    cnt++;
3525       if (cnt != 1)
3526 	{
3527 	  fail = true;
3528 	  break;
3529 	}
3530     }
3531   return ! fail && ! neg && *code != ERROR_MARK;
3532 }
3533 
3534 bool
3535 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3536 		      tree loop_arg, enum tree_code code)
3537 {
3538   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3539   code_helper code_;
3540   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3541 	  && code_ == code);
3542 }
3543 
3544 
3545 
3546 /* Function vect_is_simple_reduction
3547 
3548    (1) Detect a cross-iteration def-use cycle that represents a simple
3549    reduction computation.  We look for the following pattern:
3550 
3551    loop_header:
3552      a1 = phi < a0, a2 >
3553      a3 = ...
3554      a2 = operation (a3, a1)
3555 
3556    or
3557 
3558    a3 = ...
3559    loop_header:
3560      a1 = phi < a0, a2 >
3561      a2 = operation (a3, a1)
3562 
3563    such that:
3564    1. operation is commutative and associative and it is safe to
3565       change the order of the computation
3566    2. no uses for a2 in the loop (a2 is used out of the loop)
3567    3. no uses of a1 in the loop besides the reduction operation
3568    4. no uses of a1 outside the loop.
3569 
3570    Conditions 1,4 are tested here.
3571    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3572 
3573    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3574    nested cycles.
3575 
3576    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3577    reductions:
3578 
3579      a1 = phi < a0, a2 >
3580      inner loop (def of a3)
3581      a2 = phi < a3 >
3582 
3583    (4) Detect condition expressions, ie:
3584      for (int i = 0; i < N; i++)
3585        if (a[i] < val)
3586 	ret_val = a[i];
3587 
3588 */
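/* As a concrete illustration, the following scalar loop matches
   pattern (1) above, with a1 being the loop PHI for "sum", a3 the
   loaded value a[i] and a2 the result of the addition:

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];  */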
3589 
3590 static stmt_vec_info
3591 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3592 			  bool *double_reduc, bool *reduc_chain_p)
3593 {
3594   gphi *phi = as_a <gphi *> (phi_info->stmt);
3595   gimple *phi_use_stmt = NULL;
3596   imm_use_iterator imm_iter;
3597   use_operand_p use_p;
3598 
3599   *double_reduc = false;
3600   *reduc_chain_p = false;
3601   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3602 
3603   tree phi_name = PHI_RESULT (phi);
3604   /* ???  If there are no uses of the PHI result the inner loop reduction
3605      won't be detected as possibly double-reduction by vectorizable_reduction
3606      because that tries to walk the PHI arg from the preheader edge which
3607      can be constant.  See PR60382.  */
3608   if (has_zero_uses (phi_name))
3609     return NULL;
3610   class loop *loop = (gimple_bb (phi))->loop_father;
3611   unsigned nphi_def_loop_uses = 0;
3612   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3613     {
3614       gimple *use_stmt = USE_STMT (use_p);
3615       if (is_gimple_debug (use_stmt))
3616 	continue;
3617 
3618       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3619         {
3620           if (dump_enabled_p ())
3621 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 			     "intermediate value used outside loop.\n");
3623 
3624           return NULL;
3625         }
3626 
3627       nphi_def_loop_uses++;
3628       phi_use_stmt = use_stmt;
3629     }
3630 
3631   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3632   if (TREE_CODE (latch_def) != SSA_NAME)
3633     {
3634       if (dump_enabled_p ())
3635 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636 			 "reduction: not ssa_name: %T\n", latch_def);
3637       return NULL;
3638     }
3639 
3640   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3641   if (!def_stmt_info
3642       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3643     return NULL;
3644 
3645   bool nested_in_vect_loop
3646     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3647   unsigned nlatch_def_loop_uses = 0;
3648   auto_vec<gphi *, 3> lcphis;
3649   bool inner_loop_of_double_reduc = false;
3650   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3651     {
3652       gimple *use_stmt = USE_STMT (use_p);
3653       if (is_gimple_debug (use_stmt))
3654 	continue;
3655       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3656 	nlatch_def_loop_uses++;
3657       else
3658 	{
3659 	  /* We can have more than one loop-closed PHI.  */
3660 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3661 	  if (nested_in_vect_loop
3662 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3663 		  == vect_double_reduction_def))
3664 	    inner_loop_of_double_reduc = true;
3665 	}
3666     }
3667 
3668   /* If we are vectorizing an inner reduction we are executing that
3669      in the original order only in case we are not dealing with a
3670      double reduction.  */
3671   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3672     {
3673       if (dump_enabled_p ())
3674 	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3675 			"detected nested cycle: ");
3676       return def_stmt_info;
3677     }
3678 
3679   /* When the inner loop of a double reduction ends up with more than
3680      one loop-closed PHI we have failed to classify alternate such
3681      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3682   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3683     {
3684       if (dump_enabled_p ())
3685 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3686 			 "unhandled double reduction\n");
3687       return NULL;
3688     }
3689 
3690   /* If this isn't a nested cycle or if the nested cycle reduction value
3691      is used outside of the inner loop we cannot handle uses of the reduction
3692      value.  */
3693   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3694     {
3695       if (dump_enabled_p ())
3696 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 			 "reduction used in loop.\n");
3698       return NULL;
3699     }
3700 
3701   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3702      defined in the inner loop.  */
3703   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3704     {
3705       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3706       if (gimple_phi_num_args (def_stmt) != 1
3707           || TREE_CODE (op1) != SSA_NAME)
3708         {
3709           if (dump_enabled_p ())
3710 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711 			     "unsupported phi node definition.\n");
3712 
3713           return NULL;
3714         }
3715 
3716       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3717       if (gimple_bb (def1)
3718 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3719 	  && loop->inner
3720 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3721 	  && (is_gimple_assign (def1) || is_gimple_call (def1))
3722 	  && is_a <gphi *> (phi_use_stmt)
3723 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3724         {
3725           if (dump_enabled_p ())
3726             report_vect_op (MSG_NOTE, def_stmt,
3727 			    "detected double reduction: ");
3728 
3729           *double_reduc = true;
3730 	  return def_stmt_info;
3731         }
3732 
3733       return NULL;
3734     }
3735 
3736   /* Look for the expression computing latch_def from the loop PHI result.  */
3737   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3738   code_helper code;
3739   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3740 			    path))
3741     {
3742       STMT_VINFO_REDUC_CODE (phi_info) = code;
3743       if (code == COND_EXPR && !nested_in_vect_loop)
3744 	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3745 
3746       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3747 	 reduction chain for which the additional restriction is that
3748 	 all operations in the chain are the same.  */
3749       auto_vec<stmt_vec_info, 8> reduc_chain;
3750       unsigned i;
3751       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3752       for (i = path.length () - 1; i >= 1; --i)
3753 	{
3754 	  gimple *stmt = USE_STMT (path[i].second);
3755 	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3756 	  gimple_match_op op;
3757 	  if (!gimple_extract_op (stmt, &op))
3758 	    gcc_unreachable ();
3759 	  if (gassign *assign = dyn_cast<gassign *> (stmt))
3760 	    STMT_VINFO_REDUC_IDX (stmt_info)
3761 	      = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3762 	  else
3763 	    {
3764 	      gcall *call = as_a<gcall *> (stmt);
3765 	      STMT_VINFO_REDUC_IDX (stmt_info)
3766 		= path[i].second->use - gimple_call_arg_ptr (call, 0);
3767 	    }
3768 	  bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3769 				     && (i == 1 || i == path.length () - 1));
3770 	  if ((op.code != code && !leading_conversion)
3771 	      /* We can only handle the final value in epilogue
3772 		 generation for reduction chains.  */
3773 	      || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3774 	    is_slp_reduc = false;
3775 	  /* For reduction chains we support trailing/leading
3776 	     conversions.  We do not store those in the actual chain.  */
3777 	  if (leading_conversion)
3778 	    continue;
3779 	  reduc_chain.safe_push (stmt_info);
3780 	}
3781       if (is_slp_reduc && reduc_chain.length () > 1)
3782 	{
3783 	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3784 	    {
3785 	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3786 	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3787 	    }
3788 	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3789 	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3790 
3791 	  /* Save the chain for further analysis in SLP detection.  */
3792 	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3793 	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3794 
3795 	  *reduc_chain_p = true;
3796 	  if (dump_enabled_p ())
3797 	    dump_printf_loc (MSG_NOTE, vect_location,
3798 			    "reduction: detected reduction chain\n");
3799 	}
3800       else if (dump_enabled_p ())
3801 	dump_printf_loc (MSG_NOTE, vect_location,
3802 			 "reduction: detected reduction\n");
3803 
3804       return def_stmt_info;
3805     }
3806 
3807   if (dump_enabled_p ())
3808     dump_printf_loc (MSG_NOTE, vect_location,
3809 		     "reduction: unknown pattern\n");
3810 
3811   return NULL;
3812 }
3813 
3814 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3815    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3816    or -1 if not known.  */
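/* A purely illustrative example of the calculation below: with a known
   iteration count of 100, an assumed VF of 8 and 3 peeled prologue
   iterations, the epilogue runs (100 - 3) % 8 == 1 iteration; if that
   remainder were 0 but peeling for gaps is required, a full VF (8
   iterations) is peeled instead.  When the iteration count is unknown,
   VF/2 is assumed.  */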
3817 
3818 static int
3819 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3820 {
3821   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3822   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3823     {
3824       if (dump_enabled_p ())
3825 	dump_printf_loc (MSG_NOTE, vect_location,
3826 			 "cost model: epilogue peel iters set to vf/2 "
3827 			 "because loop iterations are unknown.\n");
3828       return assumed_vf / 2;
3829     }
3830   else
3831     {
3832       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3833       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3834       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3835       /* If we need to peel for gaps, but no peeling is required, we have to
3836 	 peel VF iterations.  */
3837       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3838 	peel_iters_epilogue = assumed_vf;
3839       return peel_iters_epilogue;
3840     }
3841 }
3842 
3843 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3844 int
3845 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3846 			     int *peel_iters_epilogue,
3847 			     stmt_vector_for_cost *scalar_cost_vec,
3848 			     stmt_vector_for_cost *prologue_cost_vec,
3849 			     stmt_vector_for_cost *epilogue_cost_vec)
3850 {
3851   int retval = 0;
3852 
3853   *peel_iters_epilogue
3854     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3855 
3856   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3857     {
3858       /* If peeled iterations are known but the number of scalar loop
3859 	 iterations is unknown, count a taken branch per peeled loop.  */
3860       if (peel_iters_prologue > 0)
3861 	retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3862 				   vect_prologue);
3863       if (*peel_iters_epilogue > 0)
3864 	retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3865 				    vect_epilogue);
3866     }
3867 
3868   stmt_info_for_cost *si;
3869   int j;
3870   if (peel_iters_prologue)
3871     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3872       retval += record_stmt_cost (prologue_cost_vec,
3873 				  si->count * peel_iters_prologue,
3874 				  si->kind, si->stmt_info, si->misalign,
3875 				  vect_prologue);
3876   if (*peel_iters_epilogue)
3877     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3878       retval += record_stmt_cost (epilogue_cost_vec,
3879 				  si->count * *peel_iters_epilogue,
3880 				  si->kind, si->stmt_info, si->misalign,
3881 				  vect_epilogue);
3882 
3883   return retval;
3884 }
3885 
3886 /* Function vect_estimate_min_profitable_iters
3887 
3888    Return the number of iterations required for the vector version of the
3889    loop to be profitable relative to the cost of the scalar version of the
3890    loop.
3891 
3892    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3893    of iterations for vectorization.  -1 value means loop vectorization
3894    is not profitable.  This returned value may be used for dynamic
3895    profitability check.
3896 
3897    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3898    for static check against estimated number of iterations.  */
3899 
3900 static void
3901 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3902 				    int *ret_min_profitable_niters,
3903 				    int *ret_min_profitable_estimate,
3904 				    unsigned *suggested_unroll_factor)
3905 {
3906   int min_profitable_iters;
3907   int min_profitable_estimate;
3908   int peel_iters_prologue;
3909   int peel_iters_epilogue;
3910   unsigned vec_inside_cost = 0;
3911   int vec_outside_cost = 0;
3912   unsigned vec_prologue_cost = 0;
3913   unsigned vec_epilogue_cost = 0;
3914   int scalar_single_iter_cost = 0;
3915   int scalar_outside_cost = 0;
3916   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3917   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3918   vector_costs *target_cost_data = loop_vinfo->vector_costs;
3919 
3920   /* Cost model disabled.  */
3921   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3922     {
3923       if (dump_enabled_p ())
3924 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3925       *ret_min_profitable_niters = 0;
3926       *ret_min_profitable_estimate = 0;
3927       return;
3928     }
3929 
3930   /* Requires loop versioning tests to handle misalignment.  */
3931   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3932     {
3933       /*  FIXME: Make cost depend on complexity of individual check.  */
3934       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3935       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3936       if (dump_enabled_p ())
3937 	dump_printf (MSG_NOTE,
3938 		     "cost model: Adding cost of checks for loop "
3939 		     "versioning to treat misalignment.\n");
3940     }
3941 
3942   /* Requires loop versioning with alias checks.  */
3943   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3944     {
3945       /*  FIXME: Make cost depend on complexity of individual check.  */
3946       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3947       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3948       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3949       if (len)
3950 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3951 	(void) add_stmt_cost (target_cost_data, len * 2 - 1,
3952 			      scalar_stmt, vect_prologue);
3953       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3954       if (len)
3955 	{
3956 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3957 	  unsigned int nstmts = len * 2 - 1;
3958 	  /* +1 for each bias that needs adding.  */
3959 	  for (unsigned int i = 0; i < len; ++i)
3960 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3961 	      nstmts += 1;
3962 	  (void) add_stmt_cost (target_cost_data, nstmts,
3963 				scalar_stmt, vect_prologue);
3964 	}
3965       if (dump_enabled_p ())
3966 	dump_printf (MSG_NOTE,
3967 		     "cost model: Adding cost of checks for loop "
3968 		     "versioning aliasing.\n");
3969     }
3970 
3971   /* Requires loop versioning with niter checks.  */
3972   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3973     {
3974       /*  FIXME: Make cost depend on complexity of individual check.  */
3975       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3976 			    NULL, NULL, NULL_TREE, 0, vect_prologue);
3977       if (dump_enabled_p ())
3978 	dump_printf (MSG_NOTE,
3979 		     "cost model: Adding cost of checks for loop "
3980 		     "versioning niters.\n");
3981     }
3982 
3983   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3985 			  vect_prologue);
3986 
3987   /* Count statements in scalar loop.  Using this as scalar cost for a single
3988      iteration for now.
3989 
3990      TODO: Add outer loop support.
3991 
3992      TODO: Consider assigning different costs to different scalar
3993      statements.  */
3994 
3995   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3996 
3997   /* Add additional cost for the peeled instructions in prologue and epilogue
3998      loop.  (For fully-masked loops there will be no peeling.)
3999 
4000      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4001      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4002 
4003      TODO: Build an expression that represents peel_iters for prologue and
4004      epilogue to be used in a run-time test.  */
4005 
4006   bool prologue_need_br_taken_cost = false;
4007   bool prologue_need_br_not_taken_cost = false;
4008 
4009   /* Calculate peel_iters_prologue.  */
4010   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4011     peel_iters_prologue = 0;
4012   else if (npeel < 0)
4013     {
4014       peel_iters_prologue = assumed_vf / 2;
4015       if (dump_enabled_p ())
4016 	dump_printf (MSG_NOTE, "cost model: "
4017 		     "prologue peel iters set to vf/2.\n");
4018 
4019       /* If peeled iterations are unknown, count a taken branch and a not taken
4020 	 branch per peeled loop.  Even if scalar loop iterations are known,
4021 	 vector iterations are not known since peeled prologue iterations are
4022 	 not known.  Hence guards remain the same.  */
4023       prologue_need_br_taken_cost = true;
4024       prologue_need_br_not_taken_cost = true;
4025     }
4026   else
4027     {
4028       peel_iters_prologue = npeel;
4029       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4030 	/* If peeled iterations are known but the number of scalar loop
4031 	   iterations is unknown, count a taken branch per peeled loop.  */
4032 	prologue_need_br_taken_cost = true;
4033     }
4034 
4035   bool epilogue_need_br_taken_cost = false;
4036   bool epilogue_need_br_not_taken_cost = false;
4037 
4038   /* Calculate peel_iters_epilogue.  */
4039   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4040     /* We need to peel exactly one iteration for gaps.  */
4041     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4042   else if (npeel < 0)
4043     {
4044       /* If peeling for alignment is unknown, loop bound of main loop
4045 	 becomes unknown.  */
4046       peel_iters_epilogue = assumed_vf / 2;
4047       if (dump_enabled_p ())
4048 	dump_printf (MSG_NOTE, "cost model: "
4049 		     "epilogue peel iters set to vf/2 because "
4050 		     "peeling for alignment is unknown.\n");
4051 
4052       /* See the same reason above in peel_iters_prologue calculation.  */
4053       epilogue_need_br_taken_cost = true;
4054       epilogue_need_br_not_taken_cost = true;
4055     }
4056   else
4057     {
4058       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4059       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4060 	/* If peeled iterations are known but the number of scalar loop
4061 	   iterations is unknown, count a taken branch per peeled loop.  */
4062 	epilogue_need_br_taken_cost = true;
4063     }
4064 
4065   stmt_info_for_cost *si;
4066   int j;
4067   /* Add costs associated with peel_iters_prologue.  */
4068   if (peel_iters_prologue)
4069     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4070       {
4071 	(void) add_stmt_cost (target_cost_data,
4072 			      si->count * peel_iters_prologue, si->kind,
4073 			      si->stmt_info, si->node, si->vectype,
4074 			      si->misalign, vect_prologue);
4075       }
4076 
4077   /* Add costs associated with peel_iters_epilogue.  */
4078   if (peel_iters_epilogue)
4079     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4080       {
4081 	(void) add_stmt_cost (target_cost_data,
4082 			      si->count * peel_iters_epilogue, si->kind,
4083 			      si->stmt_info, si->node, si->vectype,
4084 			      si->misalign, vect_epilogue);
4085       }
4086 
4087   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4088 
4089   if (prologue_need_br_taken_cost)
4090     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4091 			  vect_prologue);
4092 
4093   if (prologue_need_br_not_taken_cost)
4094     (void) add_stmt_cost (target_cost_data, 1,
4095 			  cond_branch_not_taken, vect_prologue);
4096 
4097   if (epilogue_need_br_taken_cost)
4098     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4099 			  vect_epilogue);
4100 
4101   if (epilogue_need_br_not_taken_cost)
4102     (void) add_stmt_cost (target_cost_data, 1,
4103 			  cond_branch_not_taken, vect_epilogue);
4104 
4105   /* Take care of special costs for rgroup controls of partial vectors.  */
4106   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4107     {
4108       /* Calculate how many masks we need to generate.  */
4109       unsigned int num_masks = 0;
4110       rgroup_controls *rgm;
4111       unsigned int num_vectors_m1;
4112       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4113 	if (rgm->type)
4114 	  num_masks += num_vectors_m1 + 1;
4115       gcc_assert (num_masks > 0);
4116 
4117       /* In the worst case, we need to generate each mask in the prologue
4118 	 and in the loop body.  One of the loop body mask instructions
4119 	 replaces the comparison in the scalar loop, and since we don't
4120 	 count the scalar comparison against the scalar body, we shouldn't
4121 	 count that vector instruction against the vector body either.
4122 
4123 	 Sometimes we can use unpacks instead of generating prologue
4124 	 masks and sometimes the prologue mask will fold to a constant,
4125 	 so the actual prologue cost might be smaller.  However, it's
4126 	 simpler and safer to use the worst-case cost; if this ends up
4127 	 being the tie-breaker between vectorizing or not, then it's
4128 	 probably better not to vectorize.  */
4129       (void) add_stmt_cost (target_cost_data, num_masks,
4130 			    vector_stmt, NULL, NULL, NULL_TREE, 0,
4131 			    vect_prologue);
4132       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4133 			    vector_stmt, NULL, NULL, NULL_TREE, 0,
4134 			    vect_body);
4135     }
4136   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4137     {
4138       /* Referring to the functions vect_set_loop_condition_partial_vectors
4139 	 and vect_set_loop_controls_directly, we need to generate each
4140 	 length in the prologue and in the loop body if required. Although
4141 	 there are some possible optimizations, we consider the worst case
4142 	 here.  */
4143 
4144       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4145       signed char partial_load_store_bias
4146 	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4147       bool need_iterate_p
4148 	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4149 	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
4150 
4151       /* Calculate how many statements to be added.  */
4152       unsigned int prologue_stmts = 0;
4153       unsigned int body_stmts = 0;
4154 
4155       rgroup_controls *rgc;
4156       unsigned int num_vectors_m1;
4157       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4158 	if (rgc->type)
4159 	  {
4160 	    /* May need one SHIFT for nitems_total computation.  */
4161 	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4162 	    if (nitems != 1 && !niters_known_p)
4163 	      prologue_stmts += 1;
4164 
4165 	    /* May need one MAX and one MINUS for wrap around.  */
4166 	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4167 	      prologue_stmts += 2;
4168 
4169 	    /* Need one MAX and one MINUS for each batch limit except for
4170 	       the first one.  */
4171 	    prologue_stmts += num_vectors_m1 * 2;
4172 
4173 	    unsigned int num_vectors = num_vectors_m1 + 1;
4174 
4175 	    /* Need to set up lengths in prologue, only one MIN required
4176 	       for each since start index is zero.  */
4177 	    prologue_stmts += num_vectors;
4178 
4179 	    /* If we have a non-zero partial load bias, we need one PLUS
4180 	       to adjust the load length.  */
4181 	    if (partial_load_store_bias != 0)
4182 	      body_stmts += 1;
4183 
4184 	    /* Each may need two MINs and one MINUS to update lengths in body
4185 	       for next iteration.  */
4186 	    if (need_iterate_p)
4187 	      body_stmts += 3 * num_vectors;
4188 	  }
4189 
4190       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4191 			    scalar_stmt, vect_prologue);
4192       (void) add_stmt_cost (target_cost_data, body_stmts,
4193 			    scalar_stmt, vect_body);
4194     }
4195 
4196   /* FORNOW: The scalar outside cost is incremented in one of the
4197      following ways:
4198 
4199      1. The vectorizer checks for alignment and aliasing and generates
4200      a condition that allows dynamic vectorization.  A cost model
4201      check is ANDED with the versioning condition.  Hence scalar code
4202      path now has the added cost of the versioning check.
4203 
4204        if (cost > th & versioning_check)
4205          jmp to vector code
4206 
4207      Hence run-time scalar is incremented by not-taken branch cost.
4208 
4209      2. The vectorizer then checks if a prologue is required.  If the
4210      cost model check was not done before during versioning, it has to
4211      be done before the prologue check.
4212 
4213        if (cost <= th)
4214          prologue = scalar_iters
4215        if (prologue == 0)
4216          jmp to vector code
4217        else
4218          execute prologue
4219        if (prologue == num_iters)
4220 	 go to exit
4221 
4222      Hence the run-time scalar cost is incremented by a taken branch,
4223      plus a not-taken branch, plus a taken branch cost.
4224 
4225      3. The vectorizer then checks if an epilogue is required.  If the
4226      cost model check was not done before during prologue check, it
4227      has to be done with the epilogue check.
4228 
4229        if (prologue == 0)
4230          jmp to vector code
4231        else
4232          execute prologue
4233        if (prologue == num_iters)
4234 	 go to exit
4235        vector code:
4236          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4237            jmp to epilogue
4238 
4239      Hence the run-time scalar cost should be incremented by 2 taken
4240      branches.
4241 
4242      TODO: The back end may reorder the BBS's differently and reverse
4243      conditions/branch directions.  Change the estimates below to
4244      something more reasonable.  */
4245 
4246   /* If the number of iterations is known and we do not do versioning, we can
4247      decide whether to vectorize at compile time.  Hence the scalar version
4248      does not carry cost model guard costs.  */
4249   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4250       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4251     {
4252       /* Cost model check occurs at versioning.  */
4253       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4254 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4255       else
4256 	{
4257 	  /* Cost model check occurs at prologue generation.  */
4258 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4259 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4260 	      + vect_get_stmt_cost (cond_branch_not_taken);
4261 	  /* Cost model check occurs at epilogue generation.  */
4262 	  else
4263 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4264 	}
4265     }
4266 
4267   /* Complete the target-specific cost calculations.  */
4268   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4269 	       &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4270 	       suggested_unroll_factor);
4271 
4272   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4273       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4274       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4275 		    *suggested_unroll_factor,
4276 		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4277     {
4278       if (dump_enabled_p ())
4279 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4280 			 "can't unroll as unrolled vectorization factor larger"
4281 			 " than maximum vectorization factor: "
4282 			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4283 			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4284       *suggested_unroll_factor = 1;
4285     }
4286 
4287   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4288 
4289   if (dump_enabled_p ())
4290     {
4291       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4292       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4293                    vec_inside_cost);
4294       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4295                    vec_prologue_cost);
4296       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4297                    vec_epilogue_cost);
4298       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4299                    scalar_single_iter_cost);
4300       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4301                    scalar_outside_cost);
4302       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4303                    vec_outside_cost);
4304       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4305                    peel_iters_prologue);
4306       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4307                    peel_iters_epilogue);
4308     }
4309 
4310   /* Calculate number of iterations required to make the vector version
4311      profitable, relative to the loop bodies only.  The following condition
4312      must hold true:
4313      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4314      where
4315      SIC = scalar iteration cost, VIC = vector iteration cost,
4316      VOC = vector outside cost, VF = vectorization factor,
4317      NPEEL = prologue iterations + epilogue iterations,
4318      SOC = scalar outside cost for run time cost model check.  */
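  /* A worked example with purely illustrative numbers: SIC = 4,
     VIC = 8, VF = 4, NPEEL = 2, VOC = 30 and SOC = 0 give
     4 * niters > 8 * ((niters - 2) / 4) + 30, i.e.
     4 * niters > 2 * niters + 26, so the vector version starts to win
     once niters reaches 14.  The code below solves this kind of
     inequality in integer arithmetic.  */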
4319 
4320   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4321 			  - vec_inside_cost);
4322   if (saving_per_viter <= 0)
4323     {
4324       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4325 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4326 		    "vectorization did not happen for a simd loop");
4327 
4328       if (dump_enabled_p ())
4329         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330 			 "cost model: the vector iteration cost = %d "
4331 			 "divided by the scalar iteration cost = %d "
4332 			 "is greater or equal to the vectorization factor = %d"
4333                          ".\n",
4334 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4335       *ret_min_profitable_niters = -1;
4336       *ret_min_profitable_estimate = -1;
4337       return;
4338     }
4339 
4340   /* ??? The "if" arm is written to handle all cases; see below for what
4341      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4342   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4343     {
4344       /* Rewriting the condition above in terms of the number of
4345 	 vector iterations (vniters) rather than the number of
4346 	 scalar iterations (niters) gives:
4347 
4348 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4349 
4350 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4351 
4352 	 For integer N, X and Y when X > 0:
4353 
4354 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
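      /* E.g. (hypothetical values) X = 10 and Y = 25 give
	 N >= 25 /[floor] 10 + 1 = 3; indeed 3 * 10 > 25 while 2 * 10 is not.  */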
4355       int outside_overhead = (vec_outside_cost
4356 			      - scalar_single_iter_cost * peel_iters_prologue
4357 			      - scalar_single_iter_cost * peel_iters_epilogue
4358 			      - scalar_outside_cost);
4359       /* We're only interested in cases that require at least one
4360 	 vector iteration.  */
4361       int min_vec_niters = 1;
4362       if (outside_overhead > 0)
4363 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4364 
4365       if (dump_enabled_p ())
4366 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4367 		     min_vec_niters);
4368 
4369       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4370 	{
4371 	  /* Now that we know the minimum number of vector iterations,
4372 	     find the minimum niters for which the scalar cost is larger:
4373 
4374 	     SIC * niters > VIC * vniters + VOC - SOC
4375 
4376 	     We know that the minimum niters is no more than
4377 	     vniters * VF + NPEEL, but it might be (and often is) less
4378 	     than that if a partial vector iteration is cheaper than the
4379 	     equivalent scalar code.  */
4380 	  int threshold = (vec_inside_cost * min_vec_niters
4381 			   + vec_outside_cost
4382 			   - scalar_outside_cost);
4383 	  if (threshold <= 0)
4384 	    min_profitable_iters = 1;
4385 	  else
4386 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4387 	}
4388       else
4389 	/* Convert the number of vector iterations into a number of
4390 	   scalar iterations.  */
4391 	min_profitable_iters = (min_vec_niters * assumed_vf
4392 				+ peel_iters_prologue
4393 				+ peel_iters_epilogue);
4394     }
4395   else
4396     {
4397       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4398 			      * assumed_vf
4399 			      - vec_inside_cost * peel_iters_prologue
4400 			      - vec_inside_cost * peel_iters_epilogue);
4401       if (min_profitable_iters <= 0)
4402         min_profitable_iters = 0;
4403       else
4404 	{
4405 	  min_profitable_iters /= saving_per_viter;
4406 
4407 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4408 	      <= (((int) vec_inside_cost * min_profitable_iters)
4409 		  + (((int) vec_outside_cost - scalar_outside_cost)
4410 		     * assumed_vf)))
4411 	    min_profitable_iters++;
4412 	}
4413     }
4414 
4415   if (dump_enabled_p ())
4416     dump_printf (MSG_NOTE,
4417 		 "  Calculated minimum iters for profitability: %d\n",
4418 		 min_profitable_iters);
4419 
4420   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4421       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4422     /* We want the vectorized loop to execute at least once.  */
4423     min_profitable_iters = assumed_vf + peel_iters_prologue;
4424   else if (min_profitable_iters < peel_iters_prologue)
4425     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4426        vectorized loop executes at least once.  */
4427     min_profitable_iters = peel_iters_prologue;
4428 
4429   if (dump_enabled_p ())
4430     dump_printf_loc (MSG_NOTE, vect_location,
4431                      "  Runtime profitability threshold = %d\n",
4432                      min_profitable_iters);
4433 
4434   *ret_min_profitable_niters = min_profitable_iters;
4435 
4436   /* Calculate number of iterations required to make the vector version
4437      profitable, relative to the loop bodies only.
4438 
4439      Non-vectorized variant is SIC * niters and it must win over vector
4440      variant on the expected loop trip count.  The following condition must hold true:
4441      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4442 
4443   if (vec_outside_cost <= 0)
4444     min_profitable_estimate = 0;
4445   /* ??? This "else if" arm is written to handle all cases; see below for
4446      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4447   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4448     {
4449       /* This is a repeat of the code above, but with + SOC rather
4450 	 than - SOC.  */
4451       int outside_overhead = (vec_outside_cost
4452 			      - scalar_single_iter_cost * peel_iters_prologue
4453 			      - scalar_single_iter_cost * peel_iters_epilogue
4454 			      + scalar_outside_cost);
4455       int min_vec_niters = 1;
4456       if (outside_overhead > 0)
4457 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4458 
4459       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4460 	{
4461 	  int threshold = (vec_inside_cost * min_vec_niters
4462 			   + vec_outside_cost
4463 			   + scalar_outside_cost);
4464 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4465 	}
4466       else
4467 	min_profitable_estimate = (min_vec_niters * assumed_vf
4468 				   + peel_iters_prologue
4469 				   + peel_iters_epilogue);
4470     }
4471   else
4472     {
4473       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4474 				 * assumed_vf
4475 				 - vec_inside_cost * peel_iters_prologue
4476 				 - vec_inside_cost * peel_iters_epilogue)
4477 				 / ((scalar_single_iter_cost * assumed_vf)
4478 				   - vec_inside_cost);
4479     }
4480   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4481   if (dump_enabled_p ())
4482     dump_printf_loc (MSG_NOTE, vect_location,
4483 		     "  Static estimate profitability threshold = %d\n",
4484 		     min_profitable_estimate);
4485 
4486   *ret_min_profitable_estimate = min_profitable_estimate;
4487 }
4488 
4489 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4490    vector elements (not bits) for a vector with NELT elements.  */
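/* For instance (editorial illustration): OFFSET = 2 and NELT = 8 encode the
   series {2, 3, 4}, which vec_perm_indices extends to the selector
   {2, 3, ..., 9}; lane I of the result reads lane I + 2, with lanes 8 and 9
   taken from the second vec_perm operand.  */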
4491 static void
4492 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4493 			      vec_perm_builder *sel)
4494 {
4495   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4496      by vec_perm_indices.  */
4497   sel->new_vector (nelt, 1, 3);
4498   for (unsigned int i = 0; i < 3; i++)
4499     sel->quick_push (i + offset);
4500 }
4501 
4502 /* Checks whether the target supports whole-vector shifts for vectors of mode
4503    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4504    it supports vec_perm_const with masks for all necessary shift amounts.  */
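/* Sketch of the check below: for a fixed-width mode with 8 elements and no
   vec_shr pattern, this asks whether the constant permutations that shift
   the vector down by 4, 2 and 1 elements are all supported.  */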
4505 static bool
4506 have_whole_vector_shift (machine_mode mode)
4507 {
4508   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4509     return true;
4510 
4511   /* Variable-length vectors should be handled via the optab.  */
4512   unsigned int nelt;
4513   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4514     return false;
4515 
4516   vec_perm_builder sel;
4517   vec_perm_indices indices;
4518   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4519     {
4520       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4521       indices.new_vector (sel, 2, nelt);
4522       if (!can_vec_perm_const_p (mode, indices, false))
4523 	return false;
4524     }
4525   return true;
4526 }
4527 
4528 /* TODO: There is a close dependency between vect_model_*_cost and
4529    vectorizable_* functions.  Design better to avoid maintenance issues.  */
4530 
4531 /* Function vect_model_reduction_cost.
4532 
4533    Models cost for a reduction operation, including the vector ops
4534    generated within the strip-mine loop in some cases, the initial
4535    definition before the loop, and the epilogue code that must be generated.  */
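/* Rough editorial example of the accounting below, assuming a plain add
   reduction with NCOPIES = 1 and a 4-element vector: the prologue gets one
   scalar_to_vec for the initial value, and if no reduc_fn is available but
   whole-vector shifts are, the epilogue is costed as exact_log2 (4) * 2 = 4
   vector_stmts plus one vec_to_scalar extract.  */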
4536 
4537 static void
4538 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4539 			   stmt_vec_info stmt_info, internal_fn reduc_fn,
4540 			   vect_reduction_type reduction_type,
4541 			   int ncopies, stmt_vector_for_cost *cost_vec)
4542 {
4543   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4544   tree vectype;
4545   machine_mode mode;
4546   class loop *loop = NULL;
4547 
4548   if (loop_vinfo)
4549     loop = LOOP_VINFO_LOOP (loop_vinfo);
4550 
4551   /* Condition reductions generate two reductions in the loop.  */
4552   if (reduction_type == COND_REDUCTION)
4553     ncopies *= 2;
4554 
4555   vectype = STMT_VINFO_VECTYPE (stmt_info);
4556   mode = TYPE_MODE (vectype);
4557   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4558 
4559   gimple_match_op op;
4560   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4561     gcc_unreachable ();
4562 
4563   if (reduction_type == EXTRACT_LAST_REDUCTION)
4564     /* No extra instructions are needed in the prologue.  The loop body
4565        operations are costed in vectorizable_condition.  */
4566     inside_cost = 0;
4567   else if (reduction_type == FOLD_LEFT_REDUCTION)
4568     {
4569       /* No extra instructions needed in the prologue.  */
4570       prologue_cost = 0;
4571 
4572       if (reduc_fn != IFN_LAST)
4573 	/* Count one reduction-like operation per vector.  */
4574 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4575 					stmt_info, 0, vect_body);
4576       else
4577 	{
4578 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4579 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4580 	  inside_cost = record_stmt_cost (cost_vec, nelements,
4581 					  vec_to_scalar, stmt_info, 0,
4582 					  vect_body);
4583 	  inside_cost += record_stmt_cost (cost_vec, nelements,
4584 					   scalar_stmt, stmt_info, 0,
4585 					   vect_body);
4586 	}
4587     }
4588   else
4589     {
4590       /* Add in cost for initial definition.
4591 	 For cond reduction we have four vectors: initial index, step,
4592 	 initial result of the data reduction, initial value of the index
4593 	 reduction.  */
4594       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4595       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4596 					 scalar_to_vec, stmt_info, 0,
4597 					 vect_prologue);
4598     }
4599 
4600   /* Determine cost of epilogue code.
4601 
4602      We have a reduction operator that will reduce the vector in one statement.
4603      Also requires scalar extract.  */
4604 
4605   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4606     {
4607       if (reduc_fn != IFN_LAST)
4608 	{
4609 	  if (reduction_type == COND_REDUCTION)
4610 	    {
4611 	      /* An EQ stmt and a COND_EXPR stmt.  */
4612 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4613 						 vector_stmt, stmt_info, 0,
4614 						 vect_epilogue);
4615 	      /* Reduction of the max index and a reduction of the found
4616 		 values.  */
4617 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4618 						 vec_to_scalar, stmt_info, 0,
4619 						 vect_epilogue);
4620 	      /* A broadcast of the max value.  */
4621 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4622 						 scalar_to_vec, stmt_info, 0,
4623 						 vect_epilogue);
4624 	    }
4625 	  else
4626 	    {
4627 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4628 						 stmt_info, 0, vect_epilogue);
4629 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4630 						 vec_to_scalar, stmt_info, 0,
4631 						 vect_epilogue);
4632 	    }
4633 	}
4634       else if (reduction_type == COND_REDUCTION)
4635 	{
4636 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4637 	  /* Extraction of scalar elements.  */
4638 	  epilogue_cost += record_stmt_cost (cost_vec,
4639 					     2 * estimated_nunits,
4640 					     vec_to_scalar, stmt_info, 0,
4641 					     vect_epilogue);
4642 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4643 	  epilogue_cost += record_stmt_cost (cost_vec,
4644 					     2 * estimated_nunits - 3,
4645 					     scalar_stmt, stmt_info, 0,
4646 					     vect_epilogue);
4647 	}
4648       else if (reduction_type == EXTRACT_LAST_REDUCTION
4649 	       || reduction_type == FOLD_LEFT_REDUCTION)
4650 	/* No extra instructions needed in the epilogue.  */
4651 	;
4652       else
4653 	{
4654 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4655 	  tree bitsize = TYPE_SIZE (op.type);
4656 	  int element_bitsize = tree_to_uhwi (bitsize);
4657 	  int nelements = vec_size_in_bits / element_bitsize;
4658 
4659 	  if (op.code == COND_EXPR)
4660 	    op.code = MAX_EXPR;
4661 
4662 	  /* We have a whole vector shift available.  */
4663 	  if (VECTOR_MODE_P (mode)
4664 	      && directly_supported_p (op.code, vectype)
4665 	      && have_whole_vector_shift (mode))
4666 	    {
4667 	      /* Final reduction via vector shifts and the reduction operator.
4668 		 Also requires scalar extract.  */
4669 	      epilogue_cost += record_stmt_cost (cost_vec,
4670 						 exact_log2 (nelements) * 2,
4671 						 vector_stmt, stmt_info, 0,
4672 						 vect_epilogue);
4673 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4674 						 vec_to_scalar, stmt_info, 0,
4675 						 vect_epilogue);
4676 	    }
4677 	  else
4678 	    /* Use extracts and reduction op for final reduction.  For N
4679 	       elements, we have N extracts and N-1 reduction ops.  */
4680 	    epilogue_cost += record_stmt_cost (cost_vec,
4681 					       nelements + nelements - 1,
4682 					       vector_stmt, stmt_info, 0,
4683 					       vect_epilogue);
4684 	}
4685     }
4686 
4687   if (dump_enabled_p ())
4688     dump_printf (MSG_NOTE,
4689                  "vect_model_reduction_cost: inside_cost = %d, "
4690                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4691                  prologue_cost, epilogue_cost);
4692 }
4693 
4694 /* SEQ is a sequence of instructions that initialize the reduction
4695    described by REDUC_INFO.  Emit them in the appropriate place.  */
4696 
4697 static void
4698 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4699 				stmt_vec_info reduc_info, gimple *seq)
4700 {
4701   if (reduc_info->reused_accumulator)
4702     {
4703       /* When reusing an accumulator from the main loop, we only need
4704 	 initialization instructions if the main loop can be skipped.
4705 	 In that case, emit the initialization instructions at the end
4706 	 of the guard block that does the skip.  */
4707       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4708       gcc_assert (skip_edge);
4709       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4710       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4711     }
4712   else
4713     {
4714       /* The normal case: emit the initialization instructions on the
4715 	 preheader edge.  */
4716       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4717       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4718     }
4719 }
4720 
4721 /* Function get_initial_def_for_reduction
4722 
4723    Input:
4724    REDUC_INFO - the info_for_reduction
4725    INIT_VAL - the initial value of the reduction variable
4726    NEUTRAL_OP - a value that has no effect on the reduction, as per
4727 		neutral_op_for_reduction
4728 
4729    Output:
4730    Return a vector variable, initialized according to the operation that
4731 	STMT_VINFO performs. This vector will be used as the initial value
4732 	of the vector of partial results.
4733 
4734    The value we need is a vector in which element 0 has value INIT_VAL
4735    and every other element has value NEUTRAL_OP.  */
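/* Editorial example: for an integer add reduction with INIT_VAL 5 and
   NEUTRAL_OP 0 on a 4-element vector this builds { 5, 0, 0, 0 }; for a MIN
   or MAX reduction, where the neutral value is typically the initial value
   itself, the result degenerates to the splat { 5, 5, 5, 5 }.  */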
4736 
4737 static tree
4738 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4739 			       stmt_vec_info reduc_info,
4740 			       tree init_val, tree neutral_op)
4741 {
4742   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4743   tree scalar_type = TREE_TYPE (init_val);
4744   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4745   tree init_def;
4746   gimple_seq stmts = NULL;
4747 
4748   gcc_assert (vectype);
4749 
4750   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4751 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4752 
4753   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4754 	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4755 
4756   if (operand_equal_p (init_val, neutral_op))
4757     {
4758       /* If both elements are equal then the vector described above is
4759 	 just a splat.  */
4760       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4761       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4762     }
4763   else
4764     {
4765       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4766       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4767       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4768 	{
4769 	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4770 	     element 0.  */
4771 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4772 						   neutral_op);
4773 	  init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4774 				   vectype, init_def, init_val);
4775 	}
4776       else
4777 	{
4778 	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4779 	  tree_vector_builder elts (vectype, 1, 2);
4780 	  elts.quick_push (init_val);
4781 	  elts.quick_push (neutral_op);
4782 	  init_def = gimple_build_vector (&stmts, &elts);
4783 	}
4784     }
4785 
4786   if (stmts)
4787     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4788   return init_def;
4789 }
4790 
4791 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4792    which performs a reduction involving GROUP_SIZE scalar statements.
4793    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4794    is nonnull, introducing extra elements of that value will not change the
4795    result.  */
4796 
4797 static void
4798 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4799 				stmt_vec_info reduc_info,
4800 				vec<tree> *vec_oprnds,
4801 				unsigned int number_of_vectors,
4802 				unsigned int group_size, tree neutral_op)
4803 {
4804   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4805   unsigned HOST_WIDE_INT nunits;
4806   unsigned j, number_of_places_left_in_vector;
4807   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4808   unsigned int i;
4809 
4810   gcc_assert (group_size == initial_values.length () || neutral_op);
4811 
4812   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4813      created vectors. It is greater than 1 if unrolling is performed.
4814 
4815      For example, we have two scalar operands, s1 and s2 (e.g., group of
4816      strided accesses of size two), while NUNITS is four (i.e., four scalars
4817      of this type can be packed in a vector).  The output vector will contain
4818      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4819      will be 2).
4820 
4821      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4822      vectors containing the operands.
4823 
4824      For example, NUNITS is four as before, and the group size is 8
4825      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4826      {s5, s6, s7, s8}.  */
4827 
4828   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4829     nunits = group_size;
4830 
4831   number_of_places_left_in_vector = nunits;
4832   bool constant_p = true;
4833   tree_vector_builder elts (vector_type, nunits, 1);
4834   elts.quick_grow (nunits);
4835   gimple_seq ctor_seq = NULL;
4836   for (j = 0; j < nunits * number_of_vectors; ++j)
4837     {
4838       tree op;
4839       i = j % group_size;
4840 
4841       /* Get the def before the loop.  In reduction chain we have only
4842       /* Get the def before the loop.  In a reduction chain there is only
4843 	 one initial value; otherwise there is one per PHI in the group.  */
4844 	op = neutral_op;
4845       else
4846 	op = initial_values[i];
4847 
4848       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4849       number_of_places_left_in_vector--;
4850       elts[nunits - number_of_places_left_in_vector - 1] = op;
4851       if (!CONSTANT_CLASS_P (op))
4852 	constant_p = false;
4853 
4854       if (number_of_places_left_in_vector == 0)
4855 	{
4856 	  tree init;
4857 	  if (constant_p && !neutral_op
4858 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4859 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4860 	    /* Build the vector directly from ELTS.  */
4861 	    init = gimple_build_vector (&ctor_seq, &elts);
4862 	  else if (neutral_op)
4863 	    {
4864 	      /* Build a vector of the neutral value and shift the
4865 		 other elements into place.  */
4866 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4867 						   neutral_op);
4868 	      int k = nunits;
4869 	      while (k > 0 && elts[k - 1] == neutral_op)
4870 		k -= 1;
4871 	      while (k > 0)
4872 		{
4873 		  k -= 1;
4874 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4875 				       vector_type, init, elts[k]);
4876 		}
4877 	    }
4878 	  else
4879 	    {
4880 	      /* First time round, duplicate ELTS to fill the
4881 		 required number of vectors.  */
4882 	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4883 					elts, number_of_vectors, *vec_oprnds);
4884 	      break;
4885 	    }
4886 	  vec_oprnds->quick_push (init);
4887 
4888 	  number_of_places_left_in_vector = nunits;
4889 	  elts.new_vector (vector_type, nunits, 1);
4890 	  elts.quick_grow (nunits);
4891 	  constant_p = true;
4892 	}
4893     }
4894   if (ctor_seq != NULL)
4895     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4896 }
4897 
4898 /* For a statement STMT_INFO taking part in a reduction operation return
4899    the stmt_vec_info the meta information is stored on.  */
4900 
4901 stmt_vec_info
4902 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4903 {
4904   stmt_info = vect_orig_stmt (stmt_info);
4905   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4906   if (!is_a <gphi *> (stmt_info->stmt)
4907       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4908     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4909   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4910   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4911     {
4912       if (gimple_phi_num_args (phi) == 1)
4913 	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4914     }
4915   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4916     {
4917       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4918       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4919 	stmt_info = info;
4920     }
4921   return stmt_info;
4922 }
4923 
4924 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4925    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4926    return false.  */
4927 
4928 static bool
4929 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4930 				stmt_vec_info reduc_info)
4931 {
4932   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4933   if (!main_loop_vinfo)
4934     return false;
4935 
4936   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4937     return false;
4938 
4939   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4940   auto_vec<tree, 16> main_loop_results (num_phis);
4941   auto_vec<tree, 16> initial_values (num_phis);
4942   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4943     {
4944       /* The epilogue loop can be entered either from the main loop or
4945 	 from an earlier guard block.  */
4946       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4947       for (tree incoming_value : reduc_info->reduc_initial_values)
4948 	{
4949 	  /* Look for:
4950 
4951 	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4952 				    INITIAL_VALUE(guard block)>.  */
4953 	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4954 
4955 	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4956 	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4957 
4958 	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4959 	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4960 
4961 	  main_loop_results.quick_push (from_main_loop);
4962 	  initial_values.quick_push (from_skip);
4963 	}
4964     }
4965   else
4966     /* The main loop dominates the epilogue loop.  */
4967     main_loop_results.splice (reduc_info->reduc_initial_values);
4968 
4969   /* See if the main loop has the kind of accumulator we need.  */
4970   vect_reusable_accumulator *accumulator
4971     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4972   if (!accumulator
4973       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4974       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4975 		      accumulator->reduc_info->reduc_scalar_results.begin ()))
4976     return false;
4977 
4978   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4979   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4980   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4981   unsigned HOST_WIDE_INT m;
4982   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4983 			    TYPE_VECTOR_SUBPARTS (vectype), &m))
4984     return false;
4985   /* Check the intermediate vector types and operations are available.  */
4986   tree prev_vectype = old_vectype;
4987   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4988   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4989     {
4990       intermediate_nunits = exact_div (intermediate_nunits, 2);
4991       tree intermediate_vectype = get_related_vectype_for_scalar_type
4992 	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
4993       if (!intermediate_vectype
4994 	  || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
4995 				    intermediate_vectype)
4996 	  || !can_vec_extract (TYPE_MODE (prev_vectype),
4997 			       TYPE_MODE (intermediate_vectype)))
4998 	return false;
4999       prev_vectype = intermediate_vectype;
5000     }
5001 
5002   /* Non-SLP reductions might apply an adjustment after the reduction
5003      operation, in order to simplify the initialization of the accumulator.
5004      If the epilogue loop carries on from where the main loop left off,
5005      it should apply the same adjustment to the final reduction result.
5006 
5007      If the epilogue loop can also be entered directly (rather than via
5008      the main loop), we need to be able to handle that case in the same way,
5009      with the same adjustment.  (In principle we could add a PHI node
5010      to select the correct adjustment, but in practice that shouldn't be
5011      necessary.)  */
5012   tree main_adjustment
5013     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5014   if (loop_vinfo->main_loop_edge && main_adjustment)
5015     {
5016       gcc_assert (num_phis == 1);
5017       tree initial_value = initial_values[0];
5018       /* Check that we can use INITIAL_VALUE as the adjustment and
5019 	 initialize the accumulator with a neutral value instead.  */
5020       if (!operand_equal_p (initial_value, main_adjustment))
5021 	return false;
5022       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5023       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5024 						    code, initial_value);
5025     }
5026   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5027   reduc_info->reduc_initial_values.truncate (0);
5028   reduc_info->reduc_initial_values.splice (initial_values);
5029   reduc_info->reused_accumulator = accumulator;
5030   return true;
5031 }
5032 
5033 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5034    CODE, emitting the stmts to SEQ.  Returns a vector def of VECTYPE.  */
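/* Editorial sketch: reducing an 8-element vector to a 4-element VECTYPE with
   CODE = PLUS_EXPR takes one halving step that extracts the low and high
   4-element halves (directly, or by punning through an integer vector) and
   adds them; going down to 2 elements would take two such steps.  */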
5035 
5036 static tree
5037 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5038 			    gimple_seq *seq)
5039 {
5040   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5041   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5042   tree stype = TREE_TYPE (vectype);
5043   tree new_temp = vec_def;
5044   while (nunits > nunits1)
5045     {
5046       nunits /= 2;
5047       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5048 							   stype, nunits);
5049       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5050 
5051       /* The target has to make sure we support lowpart/highpart
5052 	 extraction, either via direct vector extract or through
5053 	 an integer mode punning.  */
5054       tree dst1, dst2;
5055       gimple *epilog_stmt;
5056       if (convert_optab_handler (vec_extract_optab,
5057 				 TYPE_MODE (TREE_TYPE (new_temp)),
5058 				 TYPE_MODE (vectype1))
5059 	  != CODE_FOR_nothing)
5060 	{
5061 	  /* Extract sub-vectors directly once vec_extract becomes
5062 	     a conversion optab.  */
5063 	  dst1 = make_ssa_name (vectype1);
5064 	  epilog_stmt
5065 	      = gimple_build_assign (dst1, BIT_FIELD_REF,
5066 				     build3 (BIT_FIELD_REF, vectype1,
5067 					     new_temp, TYPE_SIZE (vectype1),
5068 					     bitsize_int (0)));
5069 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5070 	  dst2 =  make_ssa_name (vectype1);
5071 	  epilog_stmt
5072 	      = gimple_build_assign (dst2, BIT_FIELD_REF,
5073 				     build3 (BIT_FIELD_REF, vectype1,
5074 					     new_temp, TYPE_SIZE (vectype1),
5075 					     bitsize_int (bitsize)));
5076 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5077 	}
5078       else
5079 	{
5080 	  /* Extract via punning to appropriately sized integer mode
5081 	     vector.  */
5082 	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
5083 	  tree etype = build_vector_type (eltype, 2);
5084 	  gcc_assert (convert_optab_handler (vec_extract_optab,
5085 					     TYPE_MODE (etype),
5086 					     TYPE_MODE (eltype))
5087 		      != CODE_FOR_nothing);
5088 	  tree tem = make_ssa_name (etype);
5089 	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5090 					     build1 (VIEW_CONVERT_EXPR,
5091 						     etype, new_temp));
5092 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5093 	  new_temp = tem;
5094 	  tem = make_ssa_name (eltype);
5095 	  epilog_stmt
5096 	      = gimple_build_assign (tem, BIT_FIELD_REF,
5097 				     build3 (BIT_FIELD_REF, eltype,
5098 					     new_temp, TYPE_SIZE (eltype),
5099 					     bitsize_int (0)));
5100 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5101 	  dst1 = make_ssa_name (vectype1);
5102 	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5103 					     build1 (VIEW_CONVERT_EXPR,
5104 						     vectype1, tem));
5105 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5106 	  tem = make_ssa_name (eltype);
5107 	  epilog_stmt
5108 	      = gimple_build_assign (tem, BIT_FIELD_REF,
5109 				     build3 (BIT_FIELD_REF, eltype,
5110 					     new_temp, TYPE_SIZE (eltype),
5111 					     bitsize_int (bitsize)));
5112 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5113 	  dst2 =  make_ssa_name (vectype1);
5114 	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5115 					     build1 (VIEW_CONVERT_EXPR,
5116 						     vectype1, tem));
5117 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5118 	}
5119 
5120       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5121     }
5122 
5123   return new_temp;
5124 }
5125 
5126 /* Function vect_create_epilog_for_reduction
5127 
5128    Create code at the loop-epilog to finalize the result of a reduction
5129    computation.
5130 
5131    STMT_INFO is the scalar reduction stmt that is being vectorized.
5132    SLP_NODE is an SLP node containing a group of reduction statements. The
5133      first one in this group is STMT_INFO.
5134    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5135    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5136      (counting from 0)
5137 
5138    This function:
5139    1. Completes the reduction def-use cycles.
5140    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5141       by calling the function specified by REDUC_FN if available, or by
5142       other means (whole-vector shifts or a scalar loop).
5143       The function also creates a new phi node at the loop exit to preserve
5144       loop-closed form, as illustrated below.
5145 
5146      The flow at the entry to this function:
5147 
5148         loop:
5149           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5150           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5151           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5152         loop_exit:
5153           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5154           use <s_out0>
5155           use <s_out0>
5156 
5157      The above is transformed by this function into:
5158 
5159         loop:
5160           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5161           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5162           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5163         loop_exit:
5164           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5165           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5166           v_out2 = reduce <v_out1>
5167           s_out3 = extract_field <v_out2, 0>
5168           s_out4 = adjust_result <s_out3>
5169           use <s_out4>
5170           use <s_out4>
5171 */
5172 
5173 static void
5174 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5175 				  stmt_vec_info stmt_info,
5176 				  slp_tree slp_node,
5177 				  slp_instance slp_node_instance)
5178 {
5179   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5180   gcc_assert (reduc_info->is_reduc_info);
5181   /* For double reductions we need to get at the inner loop reduction
5182      stmt which has the meta info attached.  Our stmt_info is that of the
5183      loop-closed PHI of the inner loop which we remember as
5184      def for the reduction PHI generation.  */
5185   bool double_reduc = false;
5186   stmt_vec_info rdef_info = stmt_info;
5187   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5188     {
5189       gcc_assert (!slp_node);
5190       double_reduc = true;
5191       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5192 					    (stmt_info->stmt, 0));
5193       stmt_info = vect_stmt_to_vectorize (stmt_info);
5194     }
5195   gphi *reduc_def_stmt
5196     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5197   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5198   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5199   tree vectype;
5200   machine_mode mode;
5201   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5202   basic_block exit_bb;
5203   tree scalar_dest;
5204   tree scalar_type;
5205   gimple *new_phi = NULL, *phi;
5206   gimple_stmt_iterator exit_gsi;
5207   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5208   gimple *epilog_stmt = NULL;
5209   gimple *exit_phi;
5210   tree bitsize;
5211   tree def;
5212   tree orig_name, scalar_result;
5213   imm_use_iterator imm_iter, phi_imm_iter;
5214   use_operand_p use_p, phi_use_p;
5215   gimple *use_stmt;
5216   auto_vec<tree> reduc_inputs;
5217   int j, i;
5218   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5219   unsigned int group_size = 1, k;
5220   auto_vec<gimple *> phis;
5221   /* SLP reduction without reduction chain, e.g.,
5222      # a1 = phi <a2, a0>
5223      # b1 = phi <b2, b0>
5224      a2 = operation (a1)
5225      b2 = operation (b1)  */
5226   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5227   bool direct_slp_reduc;
5228   tree induction_index = NULL_TREE;
5229 
5230   if (slp_node)
5231     group_size = SLP_TREE_LANES (slp_node);
5232 
5233   if (nested_in_vect_loop_p (loop, stmt_info))
5234     {
5235       outer_loop = loop;
5236       loop = loop->inner;
5237       gcc_assert (!slp_node && double_reduc);
5238     }
5239 
5240   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5241   gcc_assert (vectype);
5242   mode = TYPE_MODE (vectype);
5243 
5244   tree induc_val = NULL_TREE;
5245   tree adjustment_def = NULL;
5246   if (slp_node)
5247     ;
5248   else
5249     {
5250       /* Optimize: for induction condition reduction, if we can't use zero
5251          for induc_val, use initial_def.  */
5252       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5253 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5254       else if (double_reduc)
5255 	;
5256       else
5257 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5258     }
5259 
5260   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5261   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5262   if (slp_reduc)
5263     /* All statements produce live-out values.  */
5264     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5265   else if (slp_node)
5266     {
5267       /* The last statement in the reduction chain produces the live-out
5268 	 value.  Note SLP optimization can shuffle scalar stmts to
5269 	 optimize permutations so we have to search for the last stmt.  */
5270       for (k = 0; k < group_size; ++k)
5271 	if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5272 	  {
5273 	    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5274 	    break;
5275 	  }
5276     }
5277 
5278   unsigned vec_num;
5279   int ncopies;
5280   if (slp_node)
5281     {
5282       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5283       ncopies = 1;
5284     }
5285   else
5286     {
5287       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5288       vec_num = 1;
5289       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5290     }
5291 
5292   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5293      which is updated with the current index of the loop for every match of
5294      the original loop's cond_expr (VEC_STMT).  This results in a vector
5295      containing the last time the condition passed for that vector lane.
5296      The first match will be a 1 to allow 0 to be used for non-matching
5297      indexes.  If there are no matches at all then the vector will be all
5298      zeroes.
5299 
5300      PR92772: This algorithm is broken for architectures that support
5301      masked vectors, but do not provide fold_extract_last.  */
5302   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5303     {
5304       auto_vec<std::pair<tree, bool>, 2> ccompares;
5305       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5306       cond_info = vect_stmt_to_vectorize (cond_info);
5307       while (cond_info != reduc_info)
5308 	{
5309 	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5310 	    {
5311 	      gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5312 	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5313 	      ccompares.safe_push
5314 		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5315 				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5316 	    }
5317 	  cond_info
5318 	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5319 						 1 + STMT_VINFO_REDUC_IDX
5320 							(cond_info)));
5321 	  cond_info = vect_stmt_to_vectorize (cond_info);
5322 	}
5323       gcc_assert (ccompares.length () != 0);
5324 
5325       tree indx_before_incr, indx_after_incr;
5326       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5327       int scalar_precision
5328 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5329       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5330       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5331 	(TYPE_MODE (vectype), cr_index_scalar_type,
5332 	 TYPE_VECTOR_SUBPARTS (vectype));
5333 
5334       /* First we create a simple vector induction variable which starts
5335 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5336 	 vector size (STEP).  */
5337 
5338       /* Create a {1,2,3,...} vector.  */
5339       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5340 
5341       /* Create a vector of the step value.  */
5342       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5343       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5344 
5345       /* Create an induction variable.  */
5346       gimple_stmt_iterator incr_gsi;
5347       bool insert_after;
5348       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5349       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5350 		 insert_after, &indx_before_incr, &indx_after_incr);
5351 
5352       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5353 	 filled with zeros (VEC_ZERO).  */
5354 
5355       /* Create a vector of 0s.  */
5356       tree zero = build_zero_cst (cr_index_scalar_type);
5357       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5358 
5359       /* Create a vector phi node.  */
5360       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5361       new_phi = create_phi_node (new_phi_tree, loop->header);
5362       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5363 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
5364 
5365       /* Now take the condition from the loop's original cond_exprs
5366 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
5367 	 every match uses values from the induction variable
5368 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5369 	 (NEW_PHI_TREE).
5370 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5371 	 the new cond_expr (INDEX_COND_EXPR).  */
5372       gimple_seq stmts = NULL;
5373       for (int i = ccompares.length () - 1; i != -1; --i)
5374 	{
5375 	  tree ccompare = ccompares[i].first;
5376 	  if (ccompares[i].second)
5377 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5378 					 cr_index_vector_type,
5379 					 ccompare,
5380 					 indx_before_incr, new_phi_tree);
5381 	  else
5382 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5383 					 cr_index_vector_type,
5384 					 ccompare,
5385 					 new_phi_tree, indx_before_incr);
5386 	}
5387       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5388 
5389       /* Update the phi with the vec cond.  */
5390       induction_index = new_phi_tree;
5391       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5392 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
5393     }
5394 
5395   /* 2. Create epilog code.
5396         The reduction epilog code operates across the elements of the vector
5397         of partial results computed by the vectorized loop.
5398         The reduction epilog code consists of:
5399 
5400         step 1: compute the scalar result in a vector (v_out2)
5401         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5402         step 3: adjust the scalar result (s_out3) if needed.
5403 
5404         Step 1 can be accomplished using one of the following three schemes:
5405           (scheme 1) using reduc_fn, if available.
5406           (scheme 2) using whole-vector shifts, if available.
5407           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5408                      combined.
5409 
5410           The overall epilog code looks like this:
5411 
5412           s_out0 = phi <s_loop>         # original EXIT_PHI
5413           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5414           v_out2 = reduce <v_out1>              # step 1
5415           s_out3 = extract_field <v_out2, 0>    # step 2
5416           s_out4 = adjust_result <s_out3>       # step 3
5417 
5418           (step 3 is optional, and steps 1 and 2 may be combined).
5419           Lastly, the uses of s_out0 are replaced by s_out4.  */
5420 
5421 
5422   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5423          v_out1 = phi <VECT_DEF>
5424          Store them in NEW_PHIS.  */
5425   if (double_reduc)
5426     loop = outer_loop;
5427   exit_bb = single_exit (loop)->dest;
5428   exit_gsi = gsi_after_labels (exit_bb);
5429   reduc_inputs.create (slp_node ? vec_num : ncopies);
5430   for (unsigned i = 0; i < vec_num; i++)
5431     {
5432       gimple_seq stmts = NULL;
5433       if (slp_node)
5434 	def = vect_get_slp_vect_def (slp_node, i);
5435       else
5436 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5437       for (j = 0; j < ncopies; j++)
5438 	{
5439 	  tree new_def = copy_ssa_name (def);
5440 	  phi = create_phi_node (new_def, exit_bb);
5441 	  if (j)
5442 	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5443 	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5444 	  new_def = gimple_convert (&stmts, vectype, new_def);
5445 	  reduc_inputs.quick_push (new_def);
5446 	}
5447       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5448     }
5449 
5450   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5451          (i.e. when reduc_fn is not available) and in the final adjustment
5452 	 code (if needed).  Also get the original scalar reduction variable as
5453          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5454          represents a reduction pattern), the tree-code and scalar-def are
5455          taken from the original stmt that the pattern-stmt (STMT) replaces.
5456          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5457          are taken from STMT.  */
5458 
5459   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5460   if (orig_stmt_info != stmt_info)
5461     {
5462       /* Reduction pattern  */
5463       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5464       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5465     }
5466 
5467   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5468   scalar_type = TREE_TYPE (scalar_dest);
5469   scalar_results.truncate (0);
5470   scalar_results.reserve_exact (group_size);
5471   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5472   bitsize = TYPE_SIZE (scalar_type);
5473 
5474   /* True if we should implement SLP_REDUC using native reduction operations
5475      instead of scalar operations.  */
5476   direct_slp_reduc = (reduc_fn != IFN_LAST
5477 		      && slp_reduc
5478 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5479 
5480   /* In case of reduction chain, e.g.,
5481      # a1 = phi <a3, a0>
5482      a2 = operation (a1)
5483      a3 = operation (a2),
5484 
5485      we may end up with more than one vector result.  Here we reduce them
5486      to one vector.
5487 
5488      The same is true if we couldn't use a single defuse cycle.  */
5489   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5490       || direct_slp_reduc
5491       || ncopies > 1)
5492     {
5493       gimple_seq stmts = NULL;
5494       tree single_input = reduc_inputs[0];
5495       for (k = 1; k < reduc_inputs.length (); k++)
5496 	single_input = gimple_build (&stmts, code, vectype,
5497 				     single_input, reduc_inputs[k]);
5498       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5499 
5500       reduc_inputs.truncate (0);
5501       reduc_inputs.safe_push (single_input);
5502     }
5503 
5504   tree orig_reduc_input = reduc_inputs[0];
5505 
5506   /* If this loop is an epilogue loop that can be skipped after the
5507      main loop, we can only share a reduction operation between the
5508      main loop and the epilogue if we put it at the target of the
5509      skip edge.
5510 
5511      We can still reuse accumulators if this check fails.  Doing so has
5512      the minor(?) benefit of making the epilogue loop's scalar result
5513      independent of the main loop's scalar result.  */
5514   bool unify_with_main_loop_p = false;
5515   if (reduc_info->reused_accumulator
5516       && loop_vinfo->skip_this_loop_edge
5517       && single_succ_p (exit_bb)
5518       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5519     {
5520       unify_with_main_loop_p = true;
5521 
5522       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5523       reduc_inputs[0] = make_ssa_name (vectype);
5524       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5525       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5526 		   UNKNOWN_LOCATION);
5527       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5528 		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5529       exit_gsi = gsi_after_labels (reduc_block);
5530     }
5531 
5532   /* Shouldn't be used beyond this point.  */
5533   exit_bb = nullptr;
5534 
5535   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5536       && reduc_fn != IFN_LAST)
5537     {
5538       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5539 	 various data values where the condition matched and another vector
5540 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5541 	 need to extract the last matching index (which will be the index with
5542 	 highest value) and use this to index into the data vector.
5543 	 For the case where there were no matches, the data vector will contain
5544 	 all default values and the index vector will be all zeros.  */
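      /* Editorial walk-through with made-up lanes: if INDUCTION_INDEX is
	 { 0, 3, 0, 2 } and the data vector (REDUC_INPUTS 0) is
	 { d0, d1, d2, d3 }, the max index is 3, the comparison below selects
	 lane 1 only, the VEC_COND becomes { 0, d1, 0, 0 } and the final
	 unsigned max reduction yields d1.  */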
5545 
5546       /* Get various versions of the type of the vector of indexes.  */
5547       tree index_vec_type = TREE_TYPE (induction_index);
5548       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5549       tree index_scalar_type = TREE_TYPE (index_vec_type);
5550       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5551 
5552       /* Get an unsigned integer version of the type of the data vector.  */
5553       int scalar_precision
5554 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5555       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5556       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5557 						vectype);
5558 
5559       /* First we need to create a vector (ZERO_VEC) of zeros and another
5560 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5561 	 can create using a MAX reduction and then expanding.
5562 	 In the case where the loop never made any matches, the max index will
5563 	 be zero.  */
5564 
5565       /* Vector of {0, 0, 0,...}.  */
5566       tree zero_vec = build_zero_cst (vectype);
5567 
5568       /* Find maximum value from the vector of found indexes.  */
5569       tree max_index = make_ssa_name (index_scalar_type);
5570       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5571 							  1, induction_index);
5572       gimple_call_set_lhs (max_index_stmt, max_index);
5573       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5574 
5575       /* Vector of {max_index, max_index, max_index,...}.  */
5576       tree max_index_vec = make_ssa_name (index_vec_type);
5577       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5578 						      max_index);
5579       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5580 							max_index_vec_rhs);
5581       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5582 
5583       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5584 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5585 	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5586 	 otherwise.  Only one value should match, resulting in a vector
5587 	 (VEC_COND) with one data value and the rest zeros.
5588 	 In the case where the loop never made any matches, every index will
5589 	 match, resulting in a vector with all data values (which will all be
5590 	 the default value).  */
5591 
5592       /* Compare the max index vector to the vector of found indexes to find
5593 	 the position of the max value.  */
5594       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5595       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5596 						      induction_index,
5597 						      max_index_vec);
5598       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5599 
5600       /* Use the compare to choose either values from the data vector or
5601 	 zero.  */
5602       tree vec_cond = make_ssa_name (vectype);
5603       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5604 						   vec_compare,
5605 						   reduc_inputs[0],
5606 						   zero_vec);
5607       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5608 
5609       /* Finally we need to extract the data value from the vector (VEC_COND)
5610 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5611 	 reduction, but because this doesn't exist, we can use a MAX reduction
5612 	 instead.  The data value might be signed or a float so we need to cast
5613 	 it first.
5614 	 In the case where the loop never made any matches, the data values are
5615 	 all identical, and so will reduce down correctly.  */
5616 
5617       /* Make the matched data values unsigned.  */
5618       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5619       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5620 				       vec_cond);
5621       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5622 							VIEW_CONVERT_EXPR,
5623 							vec_cond_cast_rhs);
5624       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5625 
5626       /* Reduce down to a scalar value.  */
5627       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5628       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5629 							   1, vec_cond_cast);
5630       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5631       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5632 
5633       /* Convert the reduced value back to the result type and set as the
5634 	 result.  */
5635       gimple_seq stmts = NULL;
5636       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5637 			       data_reduc);
5638       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5639       scalar_results.safe_push (new_temp);
5640     }
5641   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5642 	   && reduc_fn == IFN_LAST)
5643     {
5644       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5645 	 idx = 0;
5646          idx_val = induction_index[0];
5647 	 val = data_reduc[0];
5648          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5649 	   if (induction_index[i] > idx_val)
5650 	     val = data_reduc[i], idx_val = induction_index[i];
5651 	 return val;  */
5652 
5653       tree data_eltype = TREE_TYPE (vectype);
5654       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5655       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5656       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5657       /* Enforced by vectorizable_reduction, which ensures we have target
5658 	 support before allowing a conditional reduction on variable-length
5659 	 vectors.  */
5660       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5661       tree idx_val = NULL_TREE, val = NULL_TREE;
5662       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5663 	{
5664 	  tree old_idx_val = idx_val;
5665 	  tree old_val = val;
5666 	  idx_val = make_ssa_name (idx_eltype);
5667 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5668 					     build3 (BIT_FIELD_REF, idx_eltype,
5669 						     induction_index,
5670 						     bitsize_int (el_size),
5671 						     bitsize_int (off)));
5672 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5673 	  val = make_ssa_name (data_eltype);
5674 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5675 					     build3 (BIT_FIELD_REF,
5676 						     data_eltype,
5677 						     reduc_inputs[0],
5678 						     bitsize_int (el_size),
5679 						     bitsize_int (off)));
5680 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681 	  if (off != 0)
5682 	    {
5683 	      tree new_idx_val = idx_val;
5684 	      if (off != v_size - el_size)
5685 		{
5686 		  new_idx_val = make_ssa_name (idx_eltype);
5687 		  epilog_stmt = gimple_build_assign (new_idx_val,
5688 						     MAX_EXPR, idx_val,
5689 						     old_idx_val);
5690 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5691 		}
5692 	      tree new_val = make_ssa_name (data_eltype);
5693 	      epilog_stmt = gimple_build_assign (new_val,
5694 						 COND_EXPR,
5695 						 build2 (GT_EXPR,
5696 							 boolean_type_node,
5697 							 idx_val,
5698 							 old_idx_val),
5699 						 val, old_val);
5700 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5701 	      idx_val = new_idx_val;
5702 	      val = new_val;
5703 	    }
5704 	}
5705       /* Convert the reduced value back to the result type and set as the
5706 	 result.  */
5707       gimple_seq stmts = NULL;
5708       val = gimple_convert (&stmts, scalar_type, val);
5709       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5710       scalar_results.safe_push (val);
5711     }
5712 
5713   /* 2.3 Create the reduction code, using one of the three schemes described
5714          above. In SLP we simply need to extract all the elements from the
5715          vector (without reducing them), so we use scalar shifts.  */
5716   else if (reduc_fn != IFN_LAST && !slp_reduc)
5717     {
5718       tree tmp;
5719       tree vec_elem_type;
5720 
5721       /* Case 1:  Create:
5722          v_out2 = reduc_expr <v_out1>  */
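      /* Editor's note: for e.g. a PLUS reduction this typically expands to
	   _tmp = .REDUC_PLUS (v_out1);
	 followed by a conversion to the scalar result type when that
	 differs from the vector element type.  */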
5723 
5724       if (dump_enabled_p ())
5725         dump_printf_loc (MSG_NOTE, vect_location,
5726 			 "Reduce using direct vector reduction.\n");
5727 
5728       gimple_seq stmts = NULL;
5729       vec_elem_type = TREE_TYPE (vectype);
5730       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5731 			       vec_elem_type, reduc_inputs[0]);
5732       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5733       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5734 
5735       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5736 	  && induc_val)
5737 	{
5738 	  /* Earlier we set the initial value to be a vector of induc_val
5739 	     values.  Check the result and if it is induc_val then replace
5740 	     with the original initial value, unless induc_val is
5741 	     the same as initial_def already.  */
5742 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5743 				  induc_val);
5744 	  tree initial_def = reduc_info->reduc_initial_values[0];
5745 
5746 	  tmp = make_ssa_name (new_scalar_dest);
5747 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5748 					     initial_def, new_temp);
5749 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 	  new_temp = tmp;
5751 	}
5752 
5753       scalar_results.safe_push (new_temp);
5754     }
5755   else if (direct_slp_reduc)
5756     {
5757       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5758 	 with the elements for other SLP statements replaced with the
5759 	 neutral value.  We can then do a normal reduction on each vector.  */
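      /* Editor's example (assumptions: REDUC_GROUP_SIZE == 2, a V4SI input
	 {a0, b0, a1, b1} and a PLUS reduction with neutral value 0): the
	 masked index vector below is {0, 1, 0, 1}; for i == 0 the select
	 yields {a0, 0, a1, 0}, which reduces to a0 + a1, and for i == 1 it
	 yields {0, b0, 0, b1}, which reduces to b0 + b1.  */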
5760 
5761       /* Enforced by vectorizable_reduction.  */
5762       gcc_assert (reduc_inputs.length () == 1);
5763       gcc_assert (pow2p_hwi (group_size));
5764 
5765       gimple_seq seq = NULL;
5766 
5767       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5768 	 and the same element size as VECTYPE.  */
5769       tree index = build_index_vector (vectype, 0, 1);
5770       tree index_type = TREE_TYPE (index);
5771       tree index_elt_type = TREE_TYPE (index_type);
5772       tree mask_type = truth_type_for (index_type);
5773 
5774       /* Create a vector that, for each element, identifies which of
5775 	 the REDUC_GROUP_SIZE results should use it.  */
5776       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5777       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5778 			    build_vector_from_val (index_type, index_mask));
5779 
5780       /* Get a neutral vector value.  This is simply a splat of the neutral
5781 	 scalar value if we have one, otherwise the initial scalar value
5782 	 is itself a neutral value.  */
5783       tree vector_identity = NULL_TREE;
5784       tree neutral_op = NULL_TREE;
5785       if (slp_node)
5786 	{
5787 	  tree initial_value = NULL_TREE;
5788 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5789 	    initial_value = reduc_info->reduc_initial_values[0];
5790 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5791 						 initial_value);
5792 	}
5793       if (neutral_op)
5794 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5795 							neutral_op);
5796       for (unsigned int i = 0; i < group_size; ++i)
5797 	{
5798 	  /* If there's no universal neutral value, we can use the
5799 	     initial scalar value from the original PHI.  This is used
5800 	     for MIN and MAX reduction, for example.  */
5801 	  if (!neutral_op)
5802 	    {
5803 	      tree scalar_value = reduc_info->reduc_initial_values[i];
5804 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5805 					     scalar_value);
5806 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5807 							      scalar_value);
5808 	    }
5809 
5810 	  /* Calculate the equivalent of:
5811 
5812 	     sel[j] = (index[j] == i);
5813 
5814 	     which selects the elements of REDUC_INPUTS[0] that should
5815 	     be included in the result.  */
5816 	  tree compare_val = build_int_cst (index_elt_type, i);
5817 	  compare_val = build_vector_from_val (index_type, compare_val);
5818 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5819 				   index, compare_val);
5820 
5821 	  /* Calculate the equivalent of:
5822 
5823 	     vec = sel ? reduc_inputs[0] : vector_identity;
5824 
5825 	     VEC is now suitable for a full vector reduction.  */
5826 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5827 				   sel, reduc_inputs[0], vector_identity);
5828 
5829 	  /* Do the reduction and convert it to the appropriate type.  */
5830 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5831 				      TREE_TYPE (vectype), vec);
5832 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5833 	  scalar_results.safe_push (scalar);
5834 	}
5835       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5836     }
5837   else
5838     {
5839       bool reduce_with_shift;
5840       tree vec_temp;
5841 
5842       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5843 
5844       /* See if the target wants to do the final (shift) reduction
5845 	 in a vector mode of smaller size and first reduce upper/lower
5846 	 halves against each other.  */
5847       enum machine_mode mode1 = mode;
5848       tree stype = TREE_TYPE (vectype);
5849       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5850       unsigned nunits1 = nunits;
5851       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5852 	  && reduc_inputs.length () == 1)
5853 	{
5854 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5855 	  /* For SLP reductions we have to make sure lanes match up, but
5856 	     since we're doing an individual-element final reduction,
5857 	     reducing the vector width here is even more important.
5858 	     ???  We could also separate lanes with permutes; for the common
5859 	     case of a power-of-two group size, odd/even extracts would work.  */
5860 	  if (slp_reduc && nunits != nunits1)
5861 	    {
5862 	      nunits1 = least_common_multiple (nunits1, group_size);
5863 	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5864 	    }
5865 	}
5866       if (!slp_reduc
5867 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5868 	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5869 
5870       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5871 							   stype, nunits1);
5872       reduce_with_shift = have_whole_vector_shift (mode1);
5873       if (!VECTOR_MODE_P (mode1)
5874 	  || !directly_supported_p (code, vectype1))
5875 	reduce_with_shift = false;
5876 
5877       /* First reduce the vector to the vector size we want to perform
5878 	 the shift reduction on, by combining upper and lower halves.  */
5879       gimple_seq stmts = NULL;
5880       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5881 					     code, &stmts);
5882       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5883       reduc_inputs[0] = new_temp;
5884 
5885       if (reduce_with_shift && !slp_reduc)
5886 	{
5887 	  int element_bitsize = tree_to_uhwi (bitsize);
5888 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5889 	     for variable-length vectors and also requires direct target support
5890 	     for loop reductions.  */
5891 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5892 	  int nelements = vec_size_in_bits / element_bitsize;
5893 	  vec_perm_builder sel;
5894 	  vec_perm_indices indices;
5895 
5896           int elt_offset;
5897 
5898           tree zero_vec = build_zero_cst (vectype1);
5899           /* Case 2: Create:
5900              for (offset = nelements/2; offset >= 1; offset/=2)
5901                 {
5902                   Create:  va' = vec_shift <va, offset>
5903                   Create:  va = vop <va, va'>
5904                 }  */
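          /* Editor's example (assuming V4SI and a PLUS reduction): starting
	     from {a, b, c, d}, the first iteration shifts in {c, d, 0, 0}
	     and adds, giving {a+c, b+d, c, d}; the second shifts in
	     {b+d, c, d, 0} and adds, leaving the full sum a+b+c+d in
	     element 0, which is extracted below via BIT_FIELD_REF.  */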
5905 
5906           tree rhs;
5907 
5908           if (dump_enabled_p ())
5909             dump_printf_loc (MSG_NOTE, vect_location,
5910 			     "Reduce using vector shifts\n");
5911 
5912 	  gimple_seq stmts = NULL;
5913 	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5914           for (elt_offset = nelements / 2;
5915                elt_offset >= 1;
5916                elt_offset /= 2)
5917             {
5918 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5919 	      indices.new_vector (sel, 2, nelements);
5920 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5921 	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5922 				       new_temp, zero_vec, mask);
5923 	      new_temp = gimple_build (&stmts, code,
5924 				       vectype1, new_name, new_temp);
5925             }
5926 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5927 
5928 	  /* 2.4  Extract the final scalar result.  Create:
5929 	     s_out3 = extract_field <v_out2, bitpos>  */
5930 
5931 	  if (dump_enabled_p ())
5932 	    dump_printf_loc (MSG_NOTE, vect_location,
5933 			     "extract scalar result\n");
5934 
5935 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5936 			bitsize, bitsize_zero_node);
5937 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5938 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5939 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5940 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5941 	  scalar_results.safe_push (new_temp);
5942         }
5943       else
5944         {
5945           /* Case 3: Create:
5946              s = extract_field <v_out2, 0>
5947              for (offset = element_size;
5948                   offset < vector_size;
5949                   offset += element_size;)
5950                {
5951                  Create:  s' = extract_field <v_out2, offset>
5952                  Create:  s = op <s, s'>  // For non SLP cases
5953                }  */
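          /* Editor's example (assuming V4SI, a PLUS reduction and the
             non-SLP case): the code below emits
               s = BIT_FIELD_REF <v_out2, 32, 0>;
               s = s + BIT_FIELD_REF <v_out2, 32, 32>;
               s = s + BIT_FIELD_REF <v_out2, 32, 64>;
               s = s + BIT_FIELD_REF <v_out2, 32, 96>;
             while in the SLP case the extracted elements are pushed to
             SCALAR_RESULTS without being combined.  */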
5954 
5955           if (dump_enabled_p ())
5956             dump_printf_loc (MSG_NOTE, vect_location,
5957 			     "Reduce using scalar code.\n");
5958 
5959 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5960 	  int element_bitsize = tree_to_uhwi (bitsize);
5961 	  tree compute_type = TREE_TYPE (vectype);
5962 	  gimple_seq stmts = NULL;
5963 	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5964             {
5965               int bit_offset;
5966 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5967 				       vec_temp, bitsize, bitsize_zero_node);
5968 
5969               /* In SLP we don't need to apply the reduction operation, so we
5970                  just collect the s' values in SCALAR_RESULTS.  */
5971               if (slp_reduc)
5972                 scalar_results.safe_push (new_temp);
5973 
5974               for (bit_offset = element_bitsize;
5975                    bit_offset < vec_size_in_bits;
5976                    bit_offset += element_bitsize)
5977                 {
5978                   tree bitpos = bitsize_int (bit_offset);
5979 		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5980 					   compute_type, vec_temp,
5981 					   bitsize, bitpos);
5982                   if (slp_reduc)
5983                     {
5984                       /* In SLP we don't need to apply the reduction operation,
5985                          so we just collect the s' values in SCALAR_RESULTS.  */
5986                       new_temp = new_name;
5987                       scalar_results.safe_push (new_name);
5988                     }
5989                   else
5990 		    new_temp = gimple_build (&stmts, code, compute_type,
5991 					     new_name, new_temp);
5992                 }
5993             }
5994 
5995           /* The only case where we need to reduce scalar results in SLP is
5996              unrolling.  If the size of SCALAR_RESULTS is greater than
5997              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5998              REDUC_GROUP_SIZE.  */
5999           if (slp_reduc)
6000             {
6001               tree res, first_res, new_res;
6002 
6003               /* Reduce multiple scalar results in case of SLP unrolling.  */
6004               for (j = group_size; scalar_results.iterate (j, &res);
6005                    j++)
6006                 {
6007                   first_res = scalar_results[j % group_size];
6008 		  new_res = gimple_build (&stmts, code, compute_type,
6009 					  first_res, res);
6010                   scalar_results[j % group_size] = new_res;
6011                 }
6012 	      scalar_results.truncate (group_size);
6013 	      for (k = 0; k < group_size; k++)
6014 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
6015 						    scalar_results[k]);
6016             }
6017           else
6018 	    {
6019 	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6020 	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6021 	      scalar_results.safe_push (new_temp);
6022 	    }
6023 
6024 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6025         }
6026 
6027       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6028 	  && induc_val)
6029 	{
6030 	  /* Earlier we set the initial value to be a vector of induc_val
6031 	     values.  Check the result and if it is induc_val then replace
6032 	     with the original initial value, unless induc_val is
6033 	     the same as initial_def already.  */
6034 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6035 				  induc_val);
6036 	  tree initial_def = reduc_info->reduc_initial_values[0];
6037 
6038 	  tree tmp = make_ssa_name (new_scalar_dest);
6039 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6040 					     initial_def, new_temp);
6041 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6042 	  scalar_results[0] = tmp;
6043 	}
6044     }
6045 
6046   /* 2.5 Adjust the final result by the initial value of the reduction
6047 	 variable. (When such adjustment is not needed, then
6048 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
6049 	 new_temp = loop_exit_def + adjustment_def  */
6050 
6051   if (adjustment_def)
6052     {
6053       gcc_assert (!slp_reduc);
6054       gimple_seq stmts = NULL;
6055       if (double_reduc)
6056 	{
6057 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6058 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6059 	  new_temp = gimple_build (&stmts, code, vectype,
6060 				   reduc_inputs[0], adjustment_def);
6061 	}
6062       else
6063 	{
6064           new_temp = scalar_results[0];
6065 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6066 	  adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6067 					   adjustment_def);
6068 	  new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6069 	  new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6070 				   new_temp, adjustment_def);
6071 	  new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6072 	}
6073 
6074       epilog_stmt = gimple_seq_last_stmt (stmts);
6075       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6076       scalar_results[0] = new_temp;
6077     }
6078 
6079   /* Record this operation if it could be reused by the epilogue loop.  */
6080   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6081       && vec_num == 1)
6082     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6083 					   { orig_reduc_input, reduc_info });
6084 
6085   if (double_reduc)
6086     loop = outer_loop;
6087 
6088   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6089           phis with new adjusted scalar results, i.e., replace use <s_out0>
6090           with use <s_out4>.
6091 
6092      Transform:
6093         loop_exit:
6094           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6095           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6096           v_out2 = reduce <v_out1>
6097           s_out3 = extract_field <v_out2, 0>
6098           s_out4 = adjust_result <s_out3>
6099           use <s_out0>
6100           use <s_out0>
6101 
6102      into:
6103 
6104         loop_exit:
6105           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6106           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6107           v_out2 = reduce <v_out1>
6108           s_out3 = extract_field <v_out2, 0>
6109           s_out4 = adjust_result <s_out3>
6110           use <s_out4>
6111           use <s_out4> */
6112 
6113   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6114   for (k = 0; k < live_out_stmts.size (); k++)
6115     {
6116       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6117       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6118 
6119       phis.create (3);
6120       /* Find the loop-closed-use at the loop exit of the original scalar
6121          result.  (The reduction result is expected to have two immediate uses,
6122          one at the latch block, and one at the loop exit).  For double
6123          reductions we are looking for exit phis of the outer loop.  */
6124       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6125         {
6126           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6127 	    {
6128 	      if (!is_gimple_debug (USE_STMT (use_p)))
6129 		phis.safe_push (USE_STMT (use_p));
6130 	    }
6131           else
6132             {
6133               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6134                 {
6135                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6136 
6137                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6138                     {
6139                       if (!flow_bb_inside_loop_p (loop,
6140                                              gimple_bb (USE_STMT (phi_use_p)))
6141 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6142                         phis.safe_push (USE_STMT (phi_use_p));
6143                     }
6144                 }
6145             }
6146         }
6147 
6148       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6149         {
6150           /* Replace the uses:  */
6151           orig_name = PHI_RESULT (exit_phi);
6152 
6153 	  /* Look for a single use at the target of the skip edge.  */
6154 	  if (unify_with_main_loop_p)
6155 	    {
6156 	      use_operand_p use_p;
6157 	      gimple *user;
6158 	      if (!single_imm_use (orig_name, &use_p, &user))
6159 		gcc_unreachable ();
6160 	      orig_name = gimple_get_lhs (user);
6161 	    }
6162 
6163           scalar_result = scalar_results[k];
6164           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6165 	    {
6166 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6167 		SET_USE (use_p, scalar_result);
6168 	      update_stmt (use_stmt);
6169 	    }
6170         }
6171 
6172       phis.release ();
6173     }
6174 }
6175 
6176 /* Return a vector of type VECTYPE that is equal to the vector select
6177    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6178    before GSI.  */
6179 
6180 static tree
6181 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6182 		     tree vec, tree identity)
6183 {
6184   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6185   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6186 					  mask, vec, identity);
6187   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6188   return cond;
6189 }
6190 
6191 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6192    order, starting with LHS.  Insert the extraction statements before GSI and
6193    associate the new scalar SSA names with variable SCALAR_DEST.
6194    Return the SSA name for the result.  */
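/* Editor's note: for a 4-element VECTOR_RHS the expansion below produces
   the chain ((((LHS op v[0]) op v[1]) op v[2]) op v[3]), with one
   BIT_FIELD_REF extraction and one scalar CODE statement per element,
   all inserted before GSI.  */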
6195 
6196 static tree
6197 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6198 		       tree_code code, tree lhs, tree vector_rhs)
6199 {
6200   tree vectype = TREE_TYPE (vector_rhs);
6201   tree scalar_type = TREE_TYPE (vectype);
6202   tree bitsize = TYPE_SIZE (scalar_type);
6203   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6204   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6205 
6206   for (unsigned HOST_WIDE_INT bit_offset = 0;
6207        bit_offset < vec_size_in_bits;
6208        bit_offset += element_bitsize)
6209     {
6210       tree bitpos = bitsize_int (bit_offset);
6211       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6212 			 bitsize, bitpos);
6213 
6214       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6215       rhs = make_ssa_name (scalar_dest, stmt);
6216       gimple_assign_set_lhs (stmt, rhs);
6217       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6218 
6219       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6220       tree new_name = make_ssa_name (scalar_dest, stmt);
6221       gimple_assign_set_lhs (stmt, new_name);
6222       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6223       lhs = new_name;
6224     }
6225   return lhs;
6226 }
6227 
6228 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6229    type of the vector input.  */
6230 
6231 static internal_fn
6232 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6233 {
6234   internal_fn mask_reduc_fn;
6235 
6236   switch (reduc_fn)
6237     {
6238     case IFN_FOLD_LEFT_PLUS:
6239       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6240       break;
6241 
6242     default:
6243       return IFN_LAST;
6244     }
6245 
6246   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6247 				      OPTIMIZE_FOR_SPEED))
6248     return mask_reduc_fn;
6249   return IFN_LAST;
6250 }
6251 
6252 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6253    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6254    statement.  CODE is the operation performed by STMT_INFO and OPS are
6255    its scalar operands.  REDUC_INDEX is the index of the operand in
6256    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6257    implements in-order reduction, or IFN_LAST if we should open-code it.
6258    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6259    that should be used to control the operation in a fully-masked loop.  */
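/* Editor's sketch (not from the original sources): with target support for
   an in-order sum this emits, per vector definition,
     reduc_var' = .FOLD_LEFT_PLUS (reduc_var, vec_def);
   or, in a fully-masked loop when IFN_MASK_FOLD_LEFT_PLUS is available,
     reduc_var' = .MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, loop_mask);
   otherwise the fold is open-coded element by element through
   vect_expand_fold_left.  */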
6260 
6261 static bool
6262 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6263 			       stmt_vec_info stmt_info,
6264 			       gimple_stmt_iterator *gsi,
6265 			       gimple **vec_stmt, slp_tree slp_node,
6266 			       gimple *reduc_def_stmt,
6267 			       tree_code code, internal_fn reduc_fn,
6268 			       tree ops[3], tree vectype_in,
6269 			       int reduc_index, vec_loop_masks *masks)
6270 {
6271   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6272   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6273   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6274 
6275   int ncopies;
6276   if (slp_node)
6277     ncopies = 1;
6278   else
6279     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6280 
6281   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6282   gcc_assert (ncopies == 1);
6283   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6284 
6285   if (slp_node)
6286     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6287 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6288 
6289   tree op0 = ops[1 - reduc_index];
6290 
6291   int group_size = 1;
6292   stmt_vec_info scalar_dest_def_info;
6293   auto_vec<tree> vec_oprnds0;
6294   if (slp_node)
6295     {
6296       auto_vec<vec<tree> > vec_defs (2);
6297       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6298       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6299       vec_defs[0].release ();
6300       vec_defs[1].release ();
6301       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6302       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6303     }
6304   else
6305     {
6306       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6307 				     op0, &vec_oprnds0);
6308       scalar_dest_def_info = stmt_info;
6309     }
6310 
6311   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6312   tree scalar_type = TREE_TYPE (scalar_dest);
6313   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6314 
6315   int vec_num = vec_oprnds0.length ();
6316   gcc_assert (vec_num == 1 || slp_node);
6317   tree vec_elem_type = TREE_TYPE (vectype_out);
6318   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6319 
6320   tree vector_identity = NULL_TREE;
6321   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6322     vector_identity = build_zero_cst (vectype_out);
6323 
6324   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6325   int i;
6326   tree def0;
6327   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6328     {
6329       gimple *new_stmt;
6330       tree mask = NULL_TREE;
6331       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6332 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6333 
6334       /* Handle MINUS by adding the negative.  */
6335       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6336 	{
6337 	  tree negated = make_ssa_name (vectype_out);
6338 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6339 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6340 	  def0 = negated;
6341 	}
6342 
6343       if (mask && mask_reduc_fn == IFN_LAST)
6344 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6345 				    vector_identity);
6346 
6347       /* On the first iteration the input is simply the scalar phi
6348 	 result, and for subsequent iterations it is the output of
6349 	 the preceding operation.  */
6350       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6351 	{
6352 	  if (mask && mask_reduc_fn != IFN_LAST)
6353 	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6354 						   def0, mask);
6355 	  else
6356 	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6357 						   def0);
6358 	  /* For chained SLP reductions the output of the previous reduction
6359 	     operation serves as the input of the next. For the final statement
6360 	     the output cannot be a temporary - we reuse the original
6361 	     scalar destination of the last statement.  */
6362 	  if (i != vec_num - 1)
6363 	    {
6364 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6365 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6366 	      gimple_set_lhs (new_stmt, reduc_var);
6367 	    }
6368 	}
6369       else
6370 	{
6371 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6372 					     reduc_var, def0);
6373 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6374 	  /* Remove the statement, so that we can use the same code paths
6375 	     as for statements that we've just created.  */
6376 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6377 	  gsi_remove (&tmp_gsi, true);
6378 	}
6379 
6380       if (i == vec_num - 1)
6381 	{
6382 	  gimple_set_lhs (new_stmt, scalar_dest);
6383 	  vect_finish_replace_stmt (loop_vinfo,
6384 				    scalar_dest_def_info,
6385 				    new_stmt);
6386 	}
6387       else
6388 	vect_finish_stmt_generation (loop_vinfo,
6389 				     scalar_dest_def_info,
6390 				     new_stmt, gsi);
6391 
6392       if (slp_node)
6393 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6394       else
6395 	{
6396 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6397 	  *vec_stmt = new_stmt;
6398 	}
6399     }
6400 
6401   return true;
6402 }
6403 
6404 /* Function is_nonwrapping_integer_induction.
6405 
6406    Check if STMT_VINFO (which is part of loop LOOP) is an integer
6407    induction that increments and does not cause overflow.  */
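/* Editor's example: for an unsigned short IV with base 0, step 4 and at
   most 10000 latch executions, the largest value 0 + 4 * 10000 = 40000
   needs 16 bits and fits the type, so the induction is accepted; with
   step 8 the maximum 80000 would need 17 bits and the induction is
   rejected as potentially wrapping.  */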
6408 
6409 static bool
6410 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6411 {
6412   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6413   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6414   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6415   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6416   widest_int ni, max_loop_value, lhs_max;
6417   wi::overflow_type overflow = wi::OVF_NONE;
6418 
6419   /* Make sure the loop is integer based.  */
6420   if (TREE_CODE (base) != INTEGER_CST
6421       || TREE_CODE (step) != INTEGER_CST)
6422     return false;
6423 
6424   /* Check that the max size of the loop will not wrap.  */
6425 
6426   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6427     return true;
6428 
6429   if (! max_stmt_executions (loop, &ni))
6430     return false;
6431 
6432   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6433 			    &overflow);
6434   if (overflow)
6435     return false;
6436 
6437   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6438 			    TYPE_SIGN (lhs_type), &overflow);
6439   if (overflow)
6440     return false;
6441 
6442   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6443 	  <= TYPE_PRECISION (lhs_type));
6444 }
6445 
6446 /* Check if masking can be supported by inserting a conditional expression.
6447    CODE is the code for the operation.  COND_FN is the conditional internal
6448    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6449 static bool
6450 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6451 			 tree vectype_in)
6452 {
6453   if (cond_fn != IFN_LAST
6454       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6455 					 OPTIMIZE_FOR_SPEED))
6456     return false;
6457 
6458   if (code.is_tree_code ())
6459     switch (tree_code (code))
6460       {
6461       case DOT_PROD_EXPR:
6462       case SAD_EXPR:
6463 	return true;
6464 
6465       default:
6466 	break;
6467       }
6468   return false;
6469 }
6470 
6471 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6472    code for the operation.  VOP is the array of operands.  MASK is the loop
6473    mask.  GSI is a statement iterator used to place the new conditional
6474    expression.  */
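/* Editor's note: for DOT_PROD_EXPR the inactive lanes of operand 1 are
   replaced with zero, so their products contribute nothing to the
   accumulated sum; for SAD_EXPR they are replaced with the matching lanes
   of operand 0, making the per-lane absolute difference zero.  E.g. with
   mask {T, F}, vop[1] = {a, b} becomes {a, 0} for DOT_PROD_EXPR and
   {a, vop[0][1]} for SAD_EXPR.  */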
6475 static void
6476 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6477 		      gimple_stmt_iterator *gsi)
6478 {
6479   switch (tree_code (code))
6480     {
6481     case DOT_PROD_EXPR:
6482       {
6483 	tree vectype = TREE_TYPE (vop[1]);
6484 	tree zero = build_zero_cst (vectype);
6485 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6486 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6487 					       mask, vop[1], zero);
6488 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6489 	vop[1] = masked_op1;
6490 	break;
6491       }
6492 
6493     case SAD_EXPR:
6494       {
6495 	tree vectype = TREE_TYPE (vop[1]);
6496 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6497 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6498 					       mask, vop[1], vop[0]);
6499 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6500 	vop[1] = masked_op1;
6501 	break;
6502       }
6503 
6504     default:
6505       gcc_unreachable ();
6506     }
6507 }
6508 
6509 /* Function vectorizable_reduction.
6510 
6511    Check if STMT_INFO performs a reduction operation that can be vectorized.
6512    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6513    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6514    Return true if STMT_INFO is vectorizable in this way.
6515 
6516    This function also handles reduction idioms (patterns) that have been
6517    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6518    may be of this form:
6519      X = pattern_expr (arg0, arg1, ..., X)
6520    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6521    sequence that had been detected and replaced by the pattern-stmt
6522    (STMT_INFO).
6523 
6524    This function also handles reduction of condition expressions, for example:
6525      for (int i = 0; i < N; i++)
6526        if (a[i] < value)
6527 	 last = a[i];
6528    This is handled by vectorising the loop and creating an additional vector
6529    containing the loop indexes for which "a[i] < value" was true.  In the
6530    function epilogue this is reduced to a single max value and then used to
6531    index into the vector of results.
6532 
6533    In some cases of reduction patterns, the type of the reduction variable X is
6534    different than the type of the other arguments of STMT_INFO.
6535    In such cases, the vectype that is used when transforming STMT_INFO into
6536    a vector stmt is different than the vectype that is used to determine the
6537    vectorization factor, because it consists of a different number of elements
6538    than the actual number of elements that are being operated upon in parallel.
6539 
6540    For example, consider an accumulation of shorts into an int accumulator.
6541    On some targets it's possible to vectorize this pattern operating on 8
6542    shorts at a time (hence, the vectype for purposes of determining the
6543    vectorization factor should be V8HI); on the other hand, the vectype that
6544    is used to create the vector form is actually V4SI (the type of the result).
6545 
6546    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6547    indicates what is the actual level of parallelism (V8HI in the example), so
6548    that the right vectorization factor would be derived.  This vectype
6549    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6550    be used to create the vectorized stmt.  The right vectype for the vectorized
6551    stmt is obtained from the type of the result X:
6552       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6553 
6554    This means that, contrary to "regular" reductions (or "regular" stmts in
6555    general), the following equation:
6556       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6557    does *NOT* necessarily hold for reduction patterns.  */
6558 
6559 bool
6560 vectorizable_reduction (loop_vec_info loop_vinfo,
6561 			stmt_vec_info stmt_info, slp_tree slp_node,
6562 			slp_instance slp_node_instance,
6563 			stmt_vector_for_cost *cost_vec)
6564 {
6565   tree vectype_in = NULL_TREE;
6566   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6567   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6568   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6569   stmt_vec_info cond_stmt_vinfo = NULL;
6570   int i;
6571   int ncopies;
6572   bool single_defuse_cycle = false;
6573   bool nested_cycle = false;
6574   bool double_reduc = false;
6575   int vec_num;
6576   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6577   tree cond_reduc_val = NULL_TREE;
6578 
6579   /* Make sure it was already recognized as a reduction computation.  */
6580   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6581       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6582       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6583     return false;
6584 
6585   /* The stmt we store reduction analysis meta on.  */
6586   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6587   reduc_info->is_reduc_info = true;
6588 
6589   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6590     {
6591       if (is_a <gphi *> (stmt_info->stmt))
6592 	{
6593 	  if (slp_node)
6594 	    {
6595 	      /* We eventually need to set a vector type on invariant
6596 		 arguments.  */
6597 	      unsigned j;
6598 	      slp_tree child;
6599 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6600 		if (!vect_maybe_update_slp_op_vectype
6601 		       (child, SLP_TREE_VECTYPE (slp_node)))
6602 		  {
6603 		    if (dump_enabled_p ())
6604 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605 				       "incompatible vector types for "
6606 				       "invariants\n");
6607 		    return false;
6608 		  }
6609 	    }
6610 	  /* Analysis for double-reduction is done on the outer
6611 	     loop PHI, nested cycles have no further restrictions.  */
6612 	  STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6613 	}
6614       else
6615 	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6616       return true;
6617     }
6618 
6619   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6620   stmt_vec_info phi_info = stmt_info;
6621   if (!is_a <gphi *> (stmt_info->stmt))
6622     {
6623       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6624       return true;
6625     }
6626   if (slp_node)
6627     {
6628       slp_node_instance->reduc_phis = slp_node;
6629       /* ???  We're leaving slp_node to point to the PHIs; we only
6630 	 need it to get at the number of vector stmts, which wasn't
6631 	 yet initialized for the instance root.  */
6632     }
6633   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6634     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6635   else
6636     {
6637       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6638 		  == vect_double_reduction_def);
6639       use_operand_p use_p;
6640       gimple *use_stmt;
6641       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6642 				 &use_p, &use_stmt);
6643       gcc_assert (res);
6644       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6645       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6646     }
6647 
6648   /* PHIs should not participate in patterns.  */
6649   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6650   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6651 
6652   /* Verify that following REDUC_IDX from the latch def leads back to the PHI
6653      and compute the reduction chain length.  Discover the real
6654      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6655   tree reduc_def
6656     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6657 			     loop_latch_edge
6658 			       (gimple_bb (reduc_def_phi)->loop_father));
6659   unsigned reduc_chain_length = 0;
6660   bool only_slp_reduc_chain = true;
6661   stmt_info = NULL;
6662   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6663   while (reduc_def != PHI_RESULT (reduc_def_phi))
6664     {
6665       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6666       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6667       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6668 	{
6669 	  if (dump_enabled_p ())
6670 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6671 			     "reduction chain broken by patterns.\n");
6672 	  return false;
6673 	}
6674       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6675 	only_slp_reduc_chain = false;
6676       /* For epilogue generation live members of the chain need
6677          to point back to the PHI via their original stmt for
6678 	 info_for_reduction to work.  For SLP we need to look at
6679 	 all lanes here - even though we will only vectorize from
6680 	 the SLP node with live lane zero, the other live lanes also
6681 	 need to be identified as part of a reduction to be able
6682 	 to skip code generation for them.  */
6683       if (slp_for_stmt_info)
6684 	{
6685 	  for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6686 	    if (STMT_VINFO_LIVE_P (s))
6687 	      STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6688 	}
6689       else if (STMT_VINFO_LIVE_P (vdef))
6690 	STMT_VINFO_REDUC_DEF (def) = phi_info;
6691       gimple_match_op op;
6692       if (!gimple_extract_op (vdef->stmt, &op))
6693 	{
6694 	  if (dump_enabled_p ())
6695 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696 			     "reduction chain includes unsupported"
6697 			     " statement type.\n");
6698 	  return false;
6699 	}
6700       if (CONVERT_EXPR_CODE_P (op.code))
6701 	{
6702 	  if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6703 	    {
6704 	      if (dump_enabled_p ())
6705 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 				 "conversion in the reduction chain.\n");
6707 	      return false;
6708 	    }
6709 	}
6710       else if (!stmt_info)
6711 	/* First non-conversion stmt.  */
6712 	stmt_info = vdef;
6713       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6714       reduc_chain_length++;
6715       if (!stmt_info && slp_node)
6716 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6717     }
6718   /* PHIs should not participate in patterns.  */
6719   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6720 
6721   if (nested_in_vect_loop_p (loop, stmt_info))
6722     {
6723       loop = loop->inner;
6724       nested_cycle = true;
6725     }
6726 
6727   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6728      element.  */
6729   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6730     {
6731       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6732       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6733     }
6734   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6735     gcc_assert (slp_node
6736 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6737 
6738   /* 1. Is vectorizable reduction?  */
6739   /* Not supportable if the reduction variable is used in the loop, unless
6740      it's a reduction chain.  */
6741   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6742       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6743     return false;
6744 
6745   /* Reductions that are not used even in an enclosing outer-loop
6746      are expected to be "live" (used out of the loop).  */
6747   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6748       && !STMT_VINFO_LIVE_P (stmt_info))
6749     return false;
6750 
6751   /* 2. Has this been recognized as a reduction pattern?
6752 
6753      Check if STMT represents a pattern that has been recognized
6754      in earlier analysis stages.  For stmts that represent a pattern,
6755      the STMT_VINFO_RELATED_STMT field records the last stmt in
6756      the original sequence that constitutes the pattern.  */
6757 
6758   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6759   if (orig_stmt_info)
6760     {
6761       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6762       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6763     }
6764 
6765   /* 3. Check the operands of the operation.  The first operands are defined
6766         inside the loop body. The last operand is the reduction variable,
6767         which is defined by the loop-header-phi.  */
6768 
6769   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6770   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6771   gimple_match_op op;
6772   if (!gimple_extract_op (stmt_info->stmt, &op))
6773     gcc_unreachable ();
6774   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6775 			    || op.code == WIDEN_SUM_EXPR
6776 			    || op.code == SAD_EXPR);
6777   enum optab_subtype optab_query_kind = optab_vector;
6778   if (op.code == DOT_PROD_EXPR
6779       && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6780 	  != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6781     optab_query_kind = optab_vector_mixed_sign;
6782 
6783   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6784       && !SCALAR_FLOAT_TYPE_P (op.type))
6785     return false;
6786 
6787   /* Do not try to vectorize bit-precision reductions.  */
6788   if (!type_has_mode_precision_p (op.type))
6789     return false;
6790 
6791   /* For lane-reducing ops we're reducing the number of reduction PHIs
6792      which means the only use of such a PHI may be in the lane-reducing operation.  */
6793   if (lane_reduc_code_p
6794       && reduc_chain_length != 1
6795       && !only_slp_reduc_chain)
6796     {
6797       if (dump_enabled_p ())
6798 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 			 "lane-reducing reduction with extra stmts.\n");
6800       return false;
6801     }
6802 
6803   /* All uses but the last are expected to be defined in the loop.
6804      The last use is the reduction variable.  In case of nested cycle this
6805      assumption is not true: we use reduc_index to record the index of the
6806      reduction variable.  */
6807   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6808   /* We need to skip an extra operand for COND_EXPRs with embedded
6809      comparison.  */
6810   unsigned opno_adjust = 0;
6811   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6812     opno_adjust = 1;
6813   for (i = 0; i < (int) op.num_ops; i++)
6814     {
6815       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6816       if (i == 0 && op.code == COND_EXPR)
6817         continue;
6818 
6819       stmt_vec_info def_stmt_info;
6820       enum vect_def_type dt;
6821       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6822 			       i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6823 			       &vectype_op[i], &def_stmt_info))
6824 	{
6825 	  if (dump_enabled_p ())
6826 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6827 			     "use not simple.\n");
6828 	  return false;
6829 	}
6830       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6831 	continue;
6832 
6833       /* There should be only one cycle def in the stmt, the one
6834          leading to reduc_def.  */
6835       if (VECTORIZABLE_CYCLE_DEF (dt))
6836 	return false;
6837 
6838       if (!vectype_op[i])
6839 	vectype_op[i]
6840 	  = get_vectype_for_scalar_type (loop_vinfo,
6841 					 TREE_TYPE (op.ops[i]), slp_op[i]);
6842 
6843       /* To properly compute ncopies we are interested in the widest
6844 	 non-reduction input type in case we're looking at a widening
6845 	 accumulation that we later handle in vect_transform_reduction.  */
6846       if (lane_reduc_code_p
6847 	  && vectype_op[i]
6848 	  && (!vectype_in
6849 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6850 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
6851 	vectype_in = vectype_op[i];
6852 
6853       /* Record how the non-reduction-def value of COND_EXPR is defined.
6854 	 ???  For a chain of multiple CONDs we'd have to match them all up.  */
6855       if (op.code == COND_EXPR && reduc_chain_length == 1)
6856 	{
6857 	  if (dt == vect_constant_def)
6858 	    {
6859 	      cond_reduc_dt = dt;
6860 	      cond_reduc_val = op.ops[i];
6861 	    }
6862 	  else if (dt == vect_induction_def
6863 		   && def_stmt_info
6864 		   && is_nonwrapping_integer_induction (def_stmt_info, loop))
6865 	    {
6866 	      cond_reduc_dt = dt;
6867 	      cond_stmt_vinfo = def_stmt_info;
6868 	    }
6869 	}
6870     }
6871   if (!vectype_in)
6872     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6873   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6874 
6875   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6876   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6877   /* If we have a condition reduction, see if we can simplify it further.  */
6878   if (v_reduc_type == COND_REDUCTION)
6879     {
6880       if (slp_node)
6881 	return false;
6882 
6883       /* When the COND_EXPR's condition uses the reduction value, fail.  */
6884       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6885 	{
6886 	  if (dump_enabled_p ())
6887 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888 			     "condition depends on previous iteration\n");
6889 	  return false;
6890 	}
6891 
6892       if (reduc_chain_length == 1
6893 	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6894 					     vectype_in, OPTIMIZE_FOR_SPEED))
6895 	{
6896 	  if (dump_enabled_p ())
6897 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 			     "optimizing condition reduction with"
6899 			     " FOLD_EXTRACT_LAST.\n");
6900 	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6901 	}
6902       else if (cond_reduc_dt == vect_induction_def)
6903 	{
6904 	  tree base
6905 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6906 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6907 
6908 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6909 		      && TREE_CODE (step) == INTEGER_CST);
6910 	  cond_reduc_val = NULL_TREE;
6911 	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6912 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6913 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6914 	    ;
6915 	  /* Find a suitable value: below base for MAX_EXPR, above base for
6916 	     MIN_EXPR; for now punt if base is the minimum value of the type
6917 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6918 	  else if (tree_int_cst_sgn (step) == -1)
6919 	    {
6920 	      cond_reduc_op_code = MIN_EXPR;
6921 	      if (tree_int_cst_sgn (base) == -1)
6922 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6923 	      else if (tree_int_cst_lt (base,
6924 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6925 		cond_reduc_val
6926 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6927 	    }
6928 	  else
6929 	    {
6930 	      cond_reduc_op_code = MAX_EXPR;
6931 	      if (tree_int_cst_sgn (base) == 1)
6932 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6933 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6934 					base))
6935 		cond_reduc_val
6936 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6937 	    }
6938 	  if (cond_reduc_val)
6939 	    {
6940 	      if (dump_enabled_p ())
6941 		dump_printf_loc (MSG_NOTE, vect_location,
6942 				 "condition expression based on "
6943 				 "integer induction.\n");
6944 	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6945 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6946 		= cond_reduc_val;
6947 	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6948 	    }
6949 	}
6950       else if (cond_reduc_dt == vect_constant_def)
6951 	{
6952 	  enum vect_def_type cond_initial_dt;
6953 	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6954 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6955 	  if (cond_initial_dt == vect_constant_def
6956 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6957 				     TREE_TYPE (cond_reduc_val)))
6958 	    {
6959 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6960 				    cond_initial_val, cond_reduc_val);
6961 	      if (e && (integer_onep (e) || integer_zerop (e)))
6962 		{
6963 		  if (dump_enabled_p ())
6964 		    dump_printf_loc (MSG_NOTE, vect_location,
6965 				     "condition expression based on "
6966 				     "compile time constant.\n");
6967 		  /* Record reduction code at analysis stage.  */
6968 		  STMT_VINFO_REDUC_CODE (reduc_info)
6969 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6970 		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6971 		}
6972 	    }
6973 	}
6974     }
6975 
6976   if (STMT_VINFO_LIVE_P (phi_info))
6977     return false;
6978 
6979   if (slp_node)
6980     ncopies = 1;
6981   else
6982     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6983 
6984   gcc_assert (ncopies >= 1);
6985 
6986   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6987 
6988   if (nested_cycle)
6989     {
6990       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6991 		  == vect_double_reduction_def);
6992       double_reduc = true;
6993     }
6994 
6995   /* 4.2. Check support for the epilog operation.
6996 
6997           If STMT represents a reduction pattern, then the type of the
6998           reduction variable may be different than the type of the rest
6999           of the arguments.  For example, consider the case of accumulation
7000           of shorts into an int accumulator; The original code:
7001                         S1: int_a = (int) short_a;
7002           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7003 
7004           was replaced with:
7005                         STMT: int_acc = widen_sum <short_a, int_acc>
7006 
7007           This means that:
7008           1. The tree-code that is used to create the vector operation in the
7009              epilog code (that reduces the partial results) is not the
7010              tree-code of STMT, but is rather the tree-code of the original
7011              stmt from the pattern that STMT is replacing.  I.e, in the example
7012              above we want to use 'widen_sum' in the loop, but 'plus' in the
7013              epilog.
7014           2. The type (mode) we use to check available target support
7015              for the vector operation to be created in the *epilog*, is
7016              determined by the type of the reduction variable (in the example
7017              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7018              However the type (mode) we use to check available target support
7019              for the vector operation to be created *inside the loop*, is
7020              determined by the type of the other arguments to STMT (in the
7021              example we'd check this: optab_handler (widen_sum_optab,
7022 	     vect_short_mode)).
7023 
7024           This is contrary to "regular" reductions, in which the types of all
7025           the arguments are the same as the type of the reduction variable.
7026           For "regular" reductions we can therefore use the same vector type
7027           (and also the same tree-code) when generating the epilog code and
7028           when generating the code inside the loop.  */
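  /* A hedged sketch of the widen-sum case above, using hypothetical names
     and vector widths purely for illustration:

          short a[N];  int acc = 0;
          for (i = 0; i < N; i++)
            acc += a[i];                    // S1 + S2 from the example above

     Inside the vectorized loop the operation is checked on the vector of
     shorts, roughly
          vacc = widen_sum <va_short, vacc>;
     while the epilog that folds the partial sums is checked on the vector
     of ints, roughly
          acc = reduc_plus <vacc>;  */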
7029 
7030   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7031   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7032 
7033   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7034   if (reduction_type == TREE_CODE_REDUCTION)
7035     {
7036       /* Check whether it's ok to change the order of the computation.
7037 	 Generally, when vectorizing a reduction we change the order of the
7038 	 computation.  This may change the behavior of the program in some
7039 	 cases, so we need to check that this is ok.  One exception is when
7040 	 vectorizing an outer-loop: the inner-loop is executed sequentially,
7041 	 and therefore vectorizing reductions in the inner-loop during
7042 	 outer-loop vectorization is safe.  Likewise when we are vectorizing
7043 	 a series of reductions using SLP and the VF is one the reductions
7044 	 are performed in scalar order.  */
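      /* For illustration (a sketch only, not the exact grouping the
	 epilogue uses): with VF=2 the scalar order

	     s = (((init + x0) + x1) + x2) + x3

	 becomes two lane-wise partial sums (init + x0 + x2) and (x1 + x3)
	 that are only added together after the loop -- a reassociation
	 which can change rounding for floating-point types.  */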
7045       if (slp_node
7046 	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7047 	  && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7048 	;
7049       else if (needs_fold_left_reduction_p (op.type, orig_code))
7050 	{
7051 	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
7052 	     is not directly used in stmt.  */
7053 	  if (!only_slp_reduc_chain
7054 	      && reduc_chain_length != 1)
7055 	    {
7056 	      if (dump_enabled_p ())
7057 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7058 				 "in-order reduction chain without SLP.\n");
7059 	      return false;
7060 	    }
7061 	  STMT_VINFO_REDUC_TYPE (reduc_info)
7062 	    = reduction_type = FOLD_LEFT_REDUCTION;
7063 	}
7064       else if (!commutative_binary_op_p (orig_code, op.type)
7065 	       || !associative_binary_op_p (orig_code, op.type))
7066 	{
7067 	  if (dump_enabled_p ())
7068 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069 			    "reduction: not commutative/associative");
7070 	  return false;
7071 	}
7072     }
7073 
7074   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7075       && ncopies > 1)
7076     {
7077       if (dump_enabled_p ())
7078 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079 			 "multiple types in double reduction or condition "
7080 			 "reduction or fold-left reduction.\n");
7081       return false;
7082     }
7083 
7084   internal_fn reduc_fn = IFN_LAST;
7085   if (reduction_type == TREE_CODE_REDUCTION
7086       || reduction_type == FOLD_LEFT_REDUCTION
7087       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7088       || reduction_type == CONST_COND_REDUCTION)
7089     {
7090       if (reduction_type == FOLD_LEFT_REDUCTION
7091 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
7092 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7093 	{
7094 	  if (reduc_fn != IFN_LAST
7095 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7096 						  OPTIMIZE_FOR_SPEED))
7097 	    {
7098 	      if (dump_enabled_p ())
7099 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7100 				 "reduc op not supported by target.\n");
7101 
7102 	      reduc_fn = IFN_LAST;
7103 	    }
7104 	}
7105       else
7106 	{
7107 	  if (!nested_cycle || double_reduc)
7108 	    {
7109 	      if (dump_enabled_p ())
7110 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 				 "no reduc code for scalar code.\n");
7112 
7113 	      return false;
7114 	    }
7115 	}
7116     }
7117   else if (reduction_type == COND_REDUCTION)
7118     {
7119       int scalar_precision
7120 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7121       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7122       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7123 						vectype_out);
7124 
7125       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7126 					  OPTIMIZE_FOR_SPEED))
7127 	reduc_fn = IFN_REDUC_MAX;
7128     }
7129   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7130 
7131   if (reduction_type != EXTRACT_LAST_REDUCTION
7132       && (!nested_cycle || double_reduc)
7133       && reduc_fn == IFN_LAST
7134       && !nunits_out.is_constant ())
7135     {
7136       if (dump_enabled_p ())
7137 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7138 			 "missing target support for reduction on"
7139 			 " variable-length vectors.\n");
7140       return false;
7141     }
7142 
7143   /* For SLP reductions, see if there is a neutral value we can use.  */
7144   tree neutral_op = NULL_TREE;
7145   if (slp_node)
7146     {
7147       tree initial_value = NULL_TREE;
7148       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7149 	initial_value = vect_phi_initial_value (reduc_def_phi);
7150       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7151 					     orig_code, initial_value);
7152     }
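  /* For reference (a non-exhaustive illustration of what
     neutral_op_for_reduction can return): 0 for PLUS/MINUS/IOR/XOR,
     1 for MULT, all-ones for AND, and for MIN/MAX the initial value of
     the reduction itself -- so NEUTRAL_OP can end up NULL_TREE for an
     unchained MIN/MAX SLP reduction.  */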
7153 
7154   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7155     {
7156       /* We can't support in-order reductions of code such as this:
7157 
7158 	   for (int i = 0; i < n1; ++i)
7159 	     for (int j = 0; j < n2; ++j)
7160 	       l += a[j];
7161 
7162 	 since GCC effectively transforms the loop when vectorizing:
7163 
7164 	   for (int i = 0; i < n1 / VF; ++i)
7165 	     for (int j = 0; j < n2; ++j)
7166 	       for (int k = 0; k < VF; ++k)
7167 		 l += a[j];
7168 
7169 	 which is a reassociation of the original operation.  */
7170       if (dump_enabled_p ())
7171 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7172 			 "in-order double reduction not supported.\n");
7173 
7174       return false;
7175     }
7176 
7177   if (reduction_type == FOLD_LEFT_REDUCTION
7178       && slp_node
7179       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7180     {
7181       /* We cannot use in-order reductions in this case because there is
7182 	 an implicit reassociation of the operations involved.  */
7183       if (dump_enabled_p ())
7184 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7185 			 "in-order unchained SLP reductions not supported.\n");
7186       return false;
7187     }
7188 
7189   /* For double reductions, and for SLP reductions with a neutral value,
7190      we construct a variable-length initial vector by loading a vector
7191      full of the neutral value and then shift-and-inserting the start
7192      values into the low-numbered elements.  */
7193   if ((double_reduc || neutral_op)
7194       && !nunits_out.is_constant ()
7195       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7196 					  vectype_out, OPTIMIZE_FOR_SPEED))
7197     {
7198       if (dump_enabled_p ())
7199 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7200 			 "reduction on variable-length vectors requires"
7201 			 " target support for a vector-shift-and-insert"
7202 			 " operation.\n");
7203       return false;
7204     }
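  /* A sketch of the construction described above, assuming a PLUS reduction
     with neutral value 0 and a single scalar start value 'init' (names are
     illustrative only):

	  v = { 0, 0, ..., 0 };            // splat of the neutral value
	  v = VEC_SHL_INSERT <v, init>;    // => { init, 0, ..., 0 }

     so reducing v together with the loop's partial results behaves as if
     the scalar accumulator had started at 'init'.  */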
7205 
7206   /* Check extra constraints for variable-length unchained SLP reductions.  */
7207   if (slp_node
7208       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7209       && !nunits_out.is_constant ())
7210     {
7211       /* We checked above that we could build the initial vector when
7212 	 there's a neutral element value.  Check here for the case in
7213 	 which each SLP statement has its own initial value and in which
7214 	 that value needs to be repeated for every instance of the
7215 	 statement within the initial vector.  */
7216       unsigned int group_size = SLP_TREE_LANES (slp_node);
7217       if (!neutral_op
7218 	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7219 					      TREE_TYPE (vectype_out)))
7220 	{
7221 	  if (dump_enabled_p ())
7222 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7223 			     "unsupported form of SLP reduction for"
7224 			     " variable-length vectors: cannot build"
7225 			     " initial vector.\n");
7226 	  return false;
7227 	}
7228       /* The epilogue code relies on the number of elements being a multiple
7229 	 of the group size.  The duplicate-and-interleave approach to setting
7230 	 up the initial vector does too.  */
7231       if (!multiple_p (nunits_out, group_size))
7232 	{
7233 	  if (dump_enabled_p ())
7234 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7235 			     "unsupported form of SLP reduction for"
7236 			     " variable-length vectors: the vector size"
7237 			     " is not a multiple of the number of results.\n");
7238 	  return false;
7239 	}
7240     }
7241 
7242   if (reduction_type == COND_REDUCTION)
7243     {
7244       widest_int ni;
7245 
7246       if (! max_loop_iterations (loop, &ni))
7247 	{
7248 	  if (dump_enabled_p ())
7249 	    dump_printf_loc (MSG_NOTE, vect_location,
7250 			     "loop count not known, cannot create cond "
7251 			     "reduction.\n");
7252 	  return false;
7253 	}
7254       /* Convert backedges to iterations.  */
7255       ni += 1;
7256 
7257       /* The additional index will be the same type as the condition.  Check
7258 	 that the loop can fit into this less one (because we'll use up the
7259 	 zero slot for when there are no matches).  */
7260       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7261       if (wi::geu_p (ni, wi::to_widest (max_index)))
7262 	{
7263 	  if (dump_enabled_p ())
7264 	    dump_printf_loc (MSG_NOTE, vect_location,
7265 			     "loop size is greater than data size.\n");
7266 	  return false;
7267 	}
7268     }
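  /* A worked example with illustrative numbers: if op.type is a 16-bit
     integer the index elements are unsigned 16-bit values, the zero slot
     is reserved for "no match", and the check above therefore rejects
     loops whose iteration count is not known to be below 65535.  */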
7269 
7270   /* In case the vectorization factor (VF) is bigger than the number
7271      of elements that we can fit in a vectype (nunits), we have to generate
7272      more than one vector stmt - i.e - we need to "unroll" the
7273      vector stmt by a factor VF/nunits.  For more details see documentation
7274      in vectorizable_operation.  */
7275 
7276   /* If the reduction is used in an outer loop we need to generate
7277      VF intermediate results, like so (e.g. for ncopies=2):
7278 	r0 = phi (init, r0)
7279 	r1 = phi (init, r1)
7280 	r0 = x0 + r0;
7281         r1 = x1 + r1;
7282     (i.e. we generate VF results in 2 registers).
7283     In this case we have a separate def-use cycle for each copy, and therefore
7284     for each copy we get the vector def for the reduction variable from the
7285     respective phi node created for this copy.
7286 
7287     Otherwise (the reduction is unused in the loop nest), we can combine
7288     together intermediate results, like so (e.g. for ncopies=2):
7289 	r = phi (init, r)
7290 	r = x0 + r;
7291 	r = x1 + r;
7292    (i.e. we generate VF/2 results in a single register).
7293    In this case for each copy we get the vector def for the reduction variable
7294    from the vectorized reduction operation generated in the previous iteration.
7295 
7296    This only works when we see both the reduction PHI and its only consumer
7297    in vectorizable_reduction and there are no intermediate stmts
7298    participating.  When unrolling we want each unrolled iteration to have its
7299    own reduction accumulator since one of the main goals of unrolling a
7300    reduction is to reduce the aggregate loop-carried latency.  */
7301   if (ncopies > 1
7302       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7303       && reduc_chain_length == 1
7304       && loop_vinfo->suggested_unroll_factor == 1)
7305     single_defuse_cycle = true;
7306 
7307   if (single_defuse_cycle || lane_reduc_code_p)
7308     {
7309       gcc_assert (op.code != COND_EXPR);
7310 
7311       /* 4. Supportable by target?  */
7312       bool ok = true;
7313 
7314       /* 4.1. check support for the operation in the loop  */
7315       machine_mode vec_mode = TYPE_MODE (vectype_in);
7316       if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7317         {
7318           if (dump_enabled_p ())
7319             dump_printf (MSG_NOTE, "op not supported by target.\n");
7320 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7321 	      || !vect_can_vectorize_without_simd_p (op.code))
7322 	    ok = false;
7323 	  else
7324 	    if (dump_enabled_p ())
7325 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7326         }
7327 
7328       if (vect_emulated_vector_p (vectype_in)
7329 	  && !vect_can_vectorize_without_simd_p (op.code))
7330 	{
7331 	  if (dump_enabled_p ())
7332 	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
7333 	  return false;
7334 	}
7335 
7336       /* Lane-reducing operations have to go through vect_transform_reduction.
7337          For the other cases try without the single cycle optimization.  */
7338       if (!ok)
7339 	{
7340 	  if (lane_reduc_code_p)
7341 	    return false;
7342 	  else
7343 	    single_defuse_cycle = false;
7344 	}
7345     }
7346   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7347 
7348   /* If the reduction stmt is one of the patterns that have a lane-reducing
7349      operation embedded, we cannot handle the case of !single_defuse_cycle.  */
7350   if ((ncopies > 1 && ! single_defuse_cycle)
7351       && lane_reduc_code_p)
7352     {
7353       if (dump_enabled_p ())
7354 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7355 			 "multi def-use cycle not possible for lane-reducing "
7356 			 "reduction operation\n");
7357       return false;
7358     }
7359 
7360   if (slp_node
7361       && !(!single_defuse_cycle
7362 	   && !lane_reduc_code_p
7363 	   && reduction_type != FOLD_LEFT_REDUCTION))
7364     for (i = 0; i < (int) op.num_ops; i++)
7365       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7366 	{
7367 	  if (dump_enabled_p ())
7368 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7369 			     "incompatible vector types for invariants\n");
7370 	  return false;
7371 	}
7372 
7373   if (slp_node)
7374     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7375   else
7376     vec_num = 1;
7377 
7378   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7379 			     reduction_type, ncopies, cost_vec);
7380   /* Cost the reduction op inside the loop if transformed via
7381      vect_transform_reduction.  Otherwise this is costed by the
7382      separate vectorizable_* routines.  */
7383   if (single_defuse_cycle || lane_reduc_code_p)
7384     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7385 
7386   if (dump_enabled_p ()
7387       && reduction_type == FOLD_LEFT_REDUCTION)
7388     dump_printf_loc (MSG_NOTE, vect_location,
7389 		     "using an in-order (fold-left) reduction.\n");
7390   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7391   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7392      reductions go through their own vectorizable_* routines.  */
7393   if (!single_defuse_cycle
7394       && !lane_reduc_code_p
7395       && reduction_type != FOLD_LEFT_REDUCTION)
7396     {
7397       stmt_vec_info tem
7398 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7399       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7400 	{
7401 	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7402 	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7403 	}
7404       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7405       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7406     }
7407   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7408     {
7409       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7410       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7411 
7412       if (reduction_type != FOLD_LEFT_REDUCTION
7413 	  && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7414 	  && (cond_fn == IFN_LAST
7415 	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7416 						  OPTIMIZE_FOR_SPEED)))
7417 	{
7418 	  if (dump_enabled_p ())
7419 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 			     "can't operate on partial vectors because"
7421 			     " no conditional operation is available.\n");
7422 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7423 	}
7424       else if (reduction_type == FOLD_LEFT_REDUCTION
7425 	       && reduc_fn == IFN_LAST
7426 	       && !expand_vec_cond_expr_p (vectype_in,
7427 					   truth_type_for (vectype_in),
7428 					   SSA_NAME))
7429 	{
7430 	  if (dump_enabled_p ())
7431 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7432 			     "can't operate on partial vectors because"
7433 			     " no conditional operation is available.\n");
7434 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7435 	}
7436       else
7437 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7438 			       vectype_in, NULL);
7439     }
7440   return true;
7441 }
7442 
7443 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7444    value.  */
7445 
7446 bool
7447 vect_transform_reduction (loop_vec_info loop_vinfo,
7448 			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7449 			  gimple **vec_stmt, slp_tree slp_node)
7450 {
7451   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7452   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7453   int i;
7454   int ncopies;
7455   int vec_num;
7456 
7457   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7458   gcc_assert (reduc_info->is_reduc_info);
7459 
7460   if (nested_in_vect_loop_p (loop, stmt_info))
7461     {
7462       loop = loop->inner;
7463       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7464     }
7465 
7466   gimple_match_op op;
7467   if (!gimple_extract_op (stmt_info->stmt, &op))
7468     gcc_unreachable ();
7469 
7470   /* All uses but the last are expected to be defined in the loop.
7471      The last use is the reduction variable.  In case of nested cycle this
7472      assumption is not true: we use reduc_index to record the index of the
7473      reduction variable.  */
7474   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7475   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7476   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7477   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7478 
7479   if (slp_node)
7480     {
7481       ncopies = 1;
7482       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7483     }
7484   else
7485     {
7486       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7487       vec_num = 1;
7488     }
7489 
7490   code_helper code = canonicalize_code (op.code, op.type);
7491   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7492   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7493   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7494 
7495   /* Transform.  */
7496   tree new_temp = NULL_TREE;
7497   auto_vec<tree> vec_oprnds0;
7498   auto_vec<tree> vec_oprnds1;
7499   auto_vec<tree> vec_oprnds2;
7500   tree def0;
7501 
7502   if (dump_enabled_p ())
7503     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7504 
7505   /* FORNOW: Multiple types are not supported for condition.  */
7506   if (code == COND_EXPR)
7507     gcc_assert (ncopies == 1);
7508 
7509   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7510 
7511   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7512   if (reduction_type == FOLD_LEFT_REDUCTION)
7513     {
7514       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7515       gcc_assert (code.is_tree_code ());
7516       return vectorize_fold_left_reduction
7517 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7518 	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7519     }
7520 
7521   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7522   gcc_assert (single_defuse_cycle
7523 	      || code == DOT_PROD_EXPR
7524 	      || code == WIDEN_SUM_EXPR
7525 	      || code == SAD_EXPR);
7526 
7527   /* Create the destination vector  */
7528   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7529   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7530 
7531   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7532 		     single_defuse_cycle && reduc_index == 0
7533 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
7534 		     single_defuse_cycle && reduc_index == 1
7535 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
7536 		     op.num_ops == 3
7537 		     && !(single_defuse_cycle && reduc_index == 2)
7538 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7539   if (single_defuse_cycle)
7540     {
7541       gcc_assert (!slp_node);
7542       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7543 				     op.ops[reduc_index],
7544 				     reduc_index == 0 ? &vec_oprnds0
7545 				     : (reduc_index == 1 ? &vec_oprnds1
7546 					: &vec_oprnds2));
7547     }
7548 
7549   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7550     {
7551       gimple *new_stmt;
7552       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7553       if (masked_loop_p && !mask_by_cond_expr)
7554 	{
7555 	  /* Make sure that the reduction accumulator is vop[0].  */
7556 	  if (reduc_index == 1)
7557 	    {
7558 	      gcc_assert (commutative_binary_op_p (code, op.type));
7559 	      std::swap (vop[0], vop[1]);
7560 	    }
7561 	  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7562 					  vectype_in, i);
7563 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7564 						    vop[0], vop[1], vop[0]);
7565 	  new_temp = make_ssa_name (vec_dest, call);
7566 	  gimple_call_set_lhs (call, new_temp);
7567 	  gimple_call_set_nothrow (call, true);
7568 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7569 	  new_stmt = call;
7570 	}
7571       else
7572 	{
7573 	  if (op.num_ops == 3)
7574 	    vop[2] = vec_oprnds2[i];
7575 
7576 	  if (masked_loop_p && mask_by_cond_expr)
7577 	    {
7578 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7579 					      vectype_in, i);
7580 	      build_vect_cond_expr (code, vop, mask, gsi);
7581 	    }
7582 
7583 	  if (code.is_internal_fn ())
7584 	    new_stmt = gimple_build_call_internal (internal_fn (code),
7585 						   op.num_ops,
7586 						   vop[0], vop[1], vop[2]);
7587 	  else
7588 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7589 					    vop[0], vop[1], vop[2]);
7590 	  new_temp = make_ssa_name (vec_dest, new_stmt);
7591 	  gimple_set_lhs (new_stmt, new_temp);
7592 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7593 	}
7594 
7595       if (slp_node)
7596 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7597       else if (single_defuse_cycle
7598 	       && i < ncopies - 1)
7599 	{
7600 	  if (reduc_index == 0)
7601 	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7602 	  else if (reduc_index == 1)
7603 	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7604 	  else if (reduc_index == 2)
7605 	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7606 	}
7607       else
7608 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7609     }
7610 
7611   if (!slp_node)
7612     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7613 
7614   return true;
7615 }
7616 
7617 /* Transform phase of a cycle PHI.  */
7618 
7619 bool
7620 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7621 			  stmt_vec_info stmt_info, gimple **vec_stmt,
7622 			  slp_tree slp_node, slp_instance slp_node_instance)
7623 {
7624   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7625   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7626   int i;
7627   int ncopies;
7628   int j;
7629   bool nested_cycle = false;
7630   int vec_num;
7631 
7632   if (nested_in_vect_loop_p (loop, stmt_info))
7633     {
7634       loop = loop->inner;
7635       nested_cycle = true;
7636     }
7637 
7638   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7639   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7640   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7641   gcc_assert (reduc_info->is_reduc_info);
7642 
7643   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7644       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7645     /* Leave the scalar phi in place.  */
7646     return true;
7647 
7648   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7649   /* For a nested cycle we do not fill the above.  */
7650   if (!vectype_in)
7651     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7652   gcc_assert (vectype_in);
7653 
7654   if (slp_node)
7655     {
7656       /* The size vect_schedule_slp_instance computes is off for us.  */
7657       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7658 				      * SLP_TREE_LANES (slp_node), vectype_in);
7659       ncopies = 1;
7660     }
7661   else
7662     {
7663       vec_num = 1;
7664       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7665     }
7666 
7667   /* Check whether we should use a single PHI node and accumulate
7668      vectors to one before the backedge.  */
7669   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7670     ncopies = 1;
7671 
7672   /* Create the destination vector  */
7673   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7674   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7675 					       vectype_out);
7676 
7677   /* Get the loop-entry arguments.  */
7678   tree vec_initial_def = NULL_TREE;
7679   auto_vec<tree> vec_initial_defs;
7680   if (slp_node)
7681     {
7682       vec_initial_defs.reserve (vec_num);
7683       if (nested_cycle)
7684 	{
7685 	  unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7686 	  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7687 			     &vec_initial_defs);
7688 	}
7689       else
7690 	{
7691 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
7692 	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
7693 	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7694 
7695 	  unsigned int num_phis = stmts.length ();
7696 	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7697 	    num_phis = 1;
7698 	  initial_values.reserve (num_phis);
7699 	  for (unsigned int i = 0; i < num_phis; ++i)
7700 	    {
7701 	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7702 	      initial_values.quick_push (vect_phi_initial_value (this_phi));
7703 	    }
7704 	  if (vec_num == 1)
7705 	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7706 	  if (!initial_values.is_empty ())
7707 	    {
7708 	      tree initial_value
7709 		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
7710 	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7711 	      tree neutral_op
7712 		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
7713 					    code, initial_value);
7714 	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7715 					      &vec_initial_defs, vec_num,
7716 					      stmts.length (), neutral_op);
7717 	    }
7718 	}
7719     }
7720   else
7721     {
7722       /* Get at the scalar def before the loop, that defines the initial
7723 	 value of the reduction variable.  */
7724       tree initial_def = vect_phi_initial_value (phi);
7725       reduc_info->reduc_initial_values.safe_push (initial_def);
7726       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7727 	 and we can't use zero for induc_val, use initial_def.  Similarly
7728 	 for REDUC_MIN and initial_def larger than the base.  */
7729       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7730 	{
7731 	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7732 	  if (TREE_CODE (initial_def) == INTEGER_CST
7733 	      && !integer_zerop (induc_val)
7734 	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7735 		   && tree_int_cst_lt (initial_def, induc_val))
7736 		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7737 		      && tree_int_cst_lt (induc_val, initial_def))))
7738 	    {
7739 	      induc_val = initial_def;
7740 	      /* Communicate we used the initial_def to epilogue
7741 		 generation.  */
7742 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7743 	    }
7744 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7745 	}
7746       else if (nested_cycle)
7747 	{
7748 	  /* Do not use an adjustment def as that case is not supported
7749 	     correctly if ncopies is not one.  */
7750 	  vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7751 					 ncopies, initial_def,
7752 					 &vec_initial_defs);
7753 	}
7754       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7755 	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7756 	/* Fill the initial vector with the initial scalar value.  */
7757 	vec_initial_def
7758 	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7759 					   initial_def, initial_def);
7760       else
7761 	{
7762 	  if (ncopies == 1)
7763 	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7764 	  if (!reduc_info->reduc_initial_values.is_empty ())
7765 	    {
7766 	      initial_def = reduc_info->reduc_initial_values[0];
7767 	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7768 	      tree neutral_op
7769 		= neutral_op_for_reduction (TREE_TYPE (initial_def),
7770 					    code, initial_def);
7771 	      gcc_assert (neutral_op);
7772 	      /* Try to simplify the vector initialization by applying an
7773 		 adjustment after the reduction has been performed.  */
7774 	      if (!reduc_info->reused_accumulator
7775 		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7776 		  && !operand_equal_p (neutral_op, initial_def))
7777 		{
7778 		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7779 		    = initial_def;
7780 		  initial_def = neutral_op;
7781 		}
7782 	      vec_initial_def
7783 		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
7784 						 initial_def, neutral_op);
7785 	    }
7786 	}
7787     }
7788 
7789   if (vec_initial_def)
7790     {
7791       vec_initial_defs.create (ncopies);
7792       for (i = 0; i < ncopies; ++i)
7793 	vec_initial_defs.quick_push (vec_initial_def);
7794     }
7795 
7796   if (auto *accumulator = reduc_info->reused_accumulator)
7797     {
7798       tree def = accumulator->reduc_input;
7799       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7800 	{
7801 	  unsigned int nreduc;
7802 	  bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7803 					    (TREE_TYPE (def)),
7804 					  TYPE_VECTOR_SUBPARTS (vectype_out),
7805 					  &nreduc);
7806 	  gcc_assert (res);
7807 	  gimple_seq stmts = NULL;
7808 	  /* Reduce the single vector to a smaller one.  */
7809 	  if (nreduc != 1)
7810 	    {
7811 	      /* Perform the reduction in the appropriate type.  */
7812 	      tree rvectype = vectype_out;
7813 	      if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7814 					      TREE_TYPE (TREE_TYPE (def))))
7815 		rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7816 					      TYPE_VECTOR_SUBPARTS
7817 						(vectype_out));
7818 	      def = vect_create_partial_epilog (def, rvectype,
7819 						STMT_VINFO_REDUC_CODE
7820 						  (reduc_info),
7821 						&stmts);
7822 	    }
7823 	  /* The epilogue loop might use a different vector mode, like
7824 	     VNx2DI vs. V2DI.  */
7825 	  if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7826 	    {
7827 	      tree reduc_type = build_vector_type_for_mode
7828 		(TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7829 	      def = gimple_convert (&stmts, reduc_type, def);
7830 	    }
7831 	  /* Adjust the input so we pick up the partially reduced value
7832 	     for the skip edge in vect_create_epilog_for_reduction.  */
7833 	  accumulator->reduc_input = def;
7834 	  /* And the reduction could be carried out using a different sign.  */
7835 	  if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7836 	    def = gimple_convert (&stmts, vectype_out, def);
7837 	  if (loop_vinfo->main_loop_edge)
7838 	    {
7839 	      /* While we'd like to insert on the edge, this would split
7840 		 blocks and disturb bookkeeping; we will also eventually
7841 		 need this on the skip edge.  Rely on sinking to fix up
7842 		 optimal placement and insert in the pred.  */
7843 	      gimple_stmt_iterator gsi
7844 		= gsi_last_bb (loop_vinfo->main_loop_edge->src);
7845 	      /* Insert before a cond that eventually skips the
7846 		 epilogue.  */
7847 	      if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7848 		gsi_prev (&gsi);
7849 	      gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7850 	    }
7851 	  else
7852 	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7853 					      stmts);
7854 	}
7855       if (loop_vinfo->main_loop_edge)
7856 	vec_initial_defs[0]
7857 	  = vect_get_main_loop_result (loop_vinfo, def,
7858 				       vec_initial_defs[0]);
7859       else
7860 	vec_initial_defs.safe_push (def);
7861     }
7862 
7863   /* Generate the reduction PHIs upfront.  */
7864   for (i = 0; i < vec_num; i++)
7865     {
7866       tree vec_init_def = vec_initial_defs[i];
7867       for (j = 0; j < ncopies; j++)
7868 	{
7869 	  /* Create the reduction-phi that defines the reduction
7870 	     operand.  */
7871 	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7872 
7873 	  /* Set the loop-entry arg of the reduction-phi.  */
7874 	  if (j != 0 && nested_cycle)
7875 	    vec_init_def = vec_initial_defs[j];
7876 	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7877 		       UNKNOWN_LOCATION);
7878 
7879 	  /* The loop-latch arg is set in epilogue processing.  */
7880 
7881 	  if (slp_node)
7882 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7883 	  else
7884 	    {
7885 	      if (j == 0)
7886 		*vec_stmt = new_phi;
7887 	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7888 	    }
7889 	}
7890     }
7891 
7892   return true;
7893 }
7894 
7895 /* Vectorizes LC PHIs.  */
7896 
7897 bool
7898 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7899 		     stmt_vec_info stmt_info, gimple **vec_stmt,
7900 		     slp_tree slp_node)
7901 {
7902   if (!loop_vinfo
7903       || !is_a <gphi *> (stmt_info->stmt)
7904       || gimple_phi_num_args (stmt_info->stmt) != 1)
7905     return false;
7906 
7907   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7908       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7909     return false;
7910 
7911   if (!vec_stmt) /* transformation not required.  */
7912     {
7913       /* Deal with copies from externs or constants that are disguised as
7914 	 loop-closed PHI nodes (PR97886).  */
7915       if (slp_node
7916 	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7917 						SLP_TREE_VECTYPE (slp_node)))
7918 	{
7919 	  if (dump_enabled_p ())
7920 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7921 			     "incompatible vector types for invariants\n");
7922 	  return false;
7923 	}
7924       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7925       return true;
7926     }
7927 
7928   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7929   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7930   basic_block bb = gimple_bb (stmt_info->stmt);
7931   edge e = single_pred_edge (bb);
7932   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7933   auto_vec<tree> vec_oprnds;
7934   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7935 		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7936 		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7937   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7938     {
7939       /* Create the vectorized LC PHI node.  */
7940       gphi *new_phi = create_phi_node (vec_dest, bb);
7941       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7942       if (slp_node)
7943 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7944       else
7945 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7946     }
7947   if (!slp_node)
7948     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7949 
7950   return true;
7951 }
7952 
7953 /* Vectorizes PHIs.  */
7954 
7955 bool
7956 vectorizable_phi (vec_info *,
7957 		  stmt_vec_info stmt_info, gimple **vec_stmt,
7958 		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7959 {
7960   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7961     return false;
7962 
7963   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7964     return false;
7965 
7966   tree vectype = SLP_TREE_VECTYPE (slp_node);
7967 
7968   if (!vec_stmt) /* transformation not required.  */
7969     {
7970       slp_tree child;
7971       unsigned i;
7972       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7973 	if (!child)
7974 	  {
7975 	    if (dump_enabled_p ())
7976 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 			       "PHI node with unvectorized backedge def\n");
7978 	    return false;
7979 	  }
7980 	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7981 	  {
7982 	    if (dump_enabled_p ())
7983 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984 			       "incompatible vector types for invariants\n");
7985 	    return false;
7986 	  }
7987 	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7988 		 && !useless_type_conversion_p (vectype,
7989 						SLP_TREE_VECTYPE (child)))
7990 	  {
7991 	    /* With bools we can have mask and non-mask precision vectors
7992 	       or different non-mask precisions.  While pattern recog is
7993 	       supposed to guarantee consistency here, bugs in it can cause
7994 	       mismatches (PR103489 and PR103800 for example).
7995 	       Deal with them here instead of ICEing later.  */
7996 	    if (dump_enabled_p ())
7997 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7998 			       "incompatible vector type setup from "
7999 			       "bool pattern detection\n");
8000 	    return false;
8001 	  }
8002 
8003       /* For single-argument PHIs assume coalescing which means zero cost
8004 	 for the scalar and the vector PHIs.  This avoids artificially
8005 	 favoring the vector path (but may pessimize it in some cases).  */
8006       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8007 	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8008 			  vector_stmt, stmt_info, vectype, 0, vect_body);
8009       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8010       return true;
8011     }
8012 
8013   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8014   basic_block bb = gimple_bb (stmt_info->stmt);
8015   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8016   auto_vec<gphi *> new_phis;
8017   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8018     {
8019       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8020 
8021       /* Skip not yet vectorized defs.  */
8022       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8023 	  && SLP_TREE_VEC_STMTS (child).is_empty ())
8024 	continue;
8025 
8026       auto_vec<tree> vec_oprnds;
8027       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8028       if (!new_phis.exists ())
8029 	{
8030 	  new_phis.create (vec_oprnds.length ());
8031 	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
8032 	    {
8033 	      /* Create the vectorized LC PHI node.  */
8034 	      new_phis.quick_push (create_phi_node (vec_dest, bb));
8035 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8036 	    }
8037 	}
8038       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8039       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8040 	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8041     }
8042   /* We should have at least one already vectorized child.  */
8043   gcc_assert (new_phis.exists ());
8044 
8045   return true;
8046 }
8047 
8048 /* Return true if VECTYPE represents a vector that requires lowering
8049    by the vector lowering pass.  */
8050 
8051 bool
8052 vect_emulated_vector_p (tree vectype)
8053 {
8054   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8055 	  && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8056 	      || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8057 }
8058 
8059 /* Return true if we can emulate CODE on an integer mode representation
8060    of a vector.  */
8061 
8062 bool
8063 vect_can_vectorize_without_simd_p (tree_code code)
8064 {
8065   switch (code)
8066     {
8067     case PLUS_EXPR:
8068     case MINUS_EXPR:
8069     case NEGATE_EXPR:
8070     case BIT_AND_EXPR:
8071     case BIT_IOR_EXPR:
8072     case BIT_XOR_EXPR:
8073     case BIT_NOT_EXPR:
8074       return true;
8075 
8076     default:
8077       return false;
8078     }
8079 }
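/* For example (a hedged illustration, not tied to any particular target):
   a bitwise AND of two V4QI vectors can be carried out directly on a
   32-bit integer word,

	uint32_t va, vb;
	uint32_t vc = va & vb;    // likewise for IOR, XOR and NOT

   while PLUS, MINUS and NEGATE can be emulated with bit-masking tricks
   that keep carries from crossing element boundaries.  */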
8080 
8081 /* Likewise, but taking a code_helper.  */
8082 
8083 bool
8084 vect_can_vectorize_without_simd_p (code_helper code)
8085 {
8086   return (code.is_tree_code ()
8087 	  && vect_can_vectorize_without_simd_p (tree_code (code)));
8088 }
8089 
8090 /* Function vectorizable_induction
8091 
8092    Check if STMT_INFO performs an induction computation that can be vectorized.
8093    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8094    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8095    Return true if STMT_INFO is vectorizable in this way.  */
8096 
8097 bool
8098 vectorizable_induction (loop_vec_info loop_vinfo,
8099 			stmt_vec_info stmt_info,
8100 			gimple **vec_stmt, slp_tree slp_node,
8101 			stmt_vector_for_cost *cost_vec)
8102 {
8103   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8104   unsigned ncopies;
8105   bool nested_in_vect_loop = false;
8106   class loop *iv_loop;
8107   tree vec_def;
8108   edge pe = loop_preheader_edge (loop);
8109   basic_block new_bb;
8110   tree new_vec, vec_init, vec_step, t;
8111   tree new_name;
8112   gimple *new_stmt;
8113   gphi *induction_phi;
8114   tree induc_def, vec_dest;
8115   tree init_expr, step_expr;
8116   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8117   unsigned i;
8118   tree expr;
8119   gimple_stmt_iterator si;
8120 
8121   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8122   if (!phi)
8123     return false;
8124 
8125   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8126     return false;
8127 
8128   /* Make sure it was recognized as induction computation.  */
8129   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8130     return false;
8131 
8132   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8133   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8134 
8135   if (slp_node)
8136     ncopies = 1;
8137   else
8138     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8139   gcc_assert (ncopies >= 1);
8140 
8141   /* FORNOW. These restrictions should be relaxed.  */
8142   if (nested_in_vect_loop_p (loop, stmt_info))
8143     {
8144       imm_use_iterator imm_iter;
8145       use_operand_p use_p;
8146       gimple *exit_phi;
8147       edge latch_e;
8148       tree loop_arg;
8149 
8150       if (ncopies > 1)
8151 	{
8152 	  if (dump_enabled_p ())
8153 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8154 			     "multiple types in nested loop.\n");
8155 	  return false;
8156 	}
8157 
8158       exit_phi = NULL;
8159       latch_e = loop_latch_edge (loop->inner);
8160       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8161       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8162 	{
8163 	  gimple *use_stmt = USE_STMT (use_p);
8164 	  if (is_gimple_debug (use_stmt))
8165 	    continue;
8166 
8167 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8168 	    {
8169 	      exit_phi = use_stmt;
8170 	      break;
8171 	    }
8172 	}
8173       if (exit_phi)
8174 	{
8175 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8176 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8177 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8178 	    {
8179 	      if (dump_enabled_p ())
8180 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 				 "inner-loop induction only used outside "
8182 				 "of the outer vectorized loop.\n");
8183 	      return false;
8184 	    }
8185 	}
8186 
8187       nested_in_vect_loop = true;
8188       iv_loop = loop->inner;
8189     }
8190   else
8191     iv_loop = loop;
8192   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8193 
8194   if (slp_node && !nunits.is_constant ())
8195     {
8196       /* The current SLP code creates the step value element-by-element.  */
8197       if (dump_enabled_p ())
8198 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199 			 "SLP induction not supported for variable-length"
8200 			 " vectors.\n");
8201       return false;
8202     }
8203 
8204   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8205     {
8206       if (dump_enabled_p ())
8207 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8208 			 "floating point induction vectorization disabled\n");
8209       return false;
8210     }
8211 
8212   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8213   gcc_assert (step_expr != NULL_TREE);
8214   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
8215       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
8216     {
8217       if (dump_enabled_p ())
8218 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8219 			 "bit-precision induction vectorization not "
8220 			 "supported.\n");
8221       return false;
8222     }
8223   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8224 
8225   /* Check for backend support of PLUS/MINUS_EXPR. */
8226   if (!directly_supported_p (PLUS_EXPR, step_vectype)
8227       || !directly_supported_p (MINUS_EXPR, step_vectype))
8228     return false;
8229 
8230   if (!vec_stmt) /* transformation not required.  */
8231     {
8232       unsigned inside_cost = 0, prologue_cost = 0;
8233       if (slp_node)
8234 	{
8235 	  /* We eventually need to set a vector type on invariant
8236 	     arguments.  */
8237 	  unsigned j;
8238 	  slp_tree child;
8239 	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8240 	    if (!vect_maybe_update_slp_op_vectype
8241 		(child, SLP_TREE_VECTYPE (slp_node)))
8242 	      {
8243 		if (dump_enabled_p ())
8244 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8245 				   "incompatible vector types for "
8246 				   "invariants\n");
8247 		return false;
8248 	      }
8249 	  /* loop cost for vec_loop.  */
8250 	  inside_cost
8251 	    = record_stmt_cost (cost_vec,
8252 				SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8253 				vector_stmt, stmt_info, 0, vect_body);
8254 	  /* prologue cost for vec_init (if not nested) and step.  */
8255 	  prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8256 					    scalar_to_vec,
8257 					    stmt_info, 0, vect_prologue);
8258 	}
8259       else /* if (!slp_node) */
8260 	{
8261 	  /* loop cost for vec_loop.  */
8262 	  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8263 					  stmt_info, 0, vect_body);
8264 	  /* prologue cost for vec_init and vec_step.  */
8265 	  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8266 					    stmt_info, 0, vect_prologue);
8267 	}
8268       if (dump_enabled_p ())
8269 	dump_printf_loc (MSG_NOTE, vect_location,
8270 			 "vect_model_induction_cost: inside_cost = %d, "
8271 			 "prologue_cost = %d .\n", inside_cost,
8272 			 prologue_cost);
8273 
8274       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8275       DUMP_VECT_SCOPE ("vectorizable_induction");
8276       return true;
8277     }
8278 
8279   /* Transform.  */
8280 
8281   /* Compute a vector variable, initialized with the first VF values of
8282      the induction variable.  E.g., for an iv with IV_PHI='X' and
8283      evolution S, for a vector of 4 units, we want to compute:
8284      [X, X + S, X + 2*S, X + 3*S].  */
8285 
8286   if (dump_enabled_p ())
8287     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8288 
8289   pe = loop_preheader_edge (iv_loop);
8290   /* Find the first insertion point in the BB.  */
8291   basic_block bb = gimple_bb (phi);
8292   si = gsi_after_labels (bb);
8293 
8294   /* For SLP induction we have to generate several IVs as for example
8295      with group size 3 we need
8296        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8297        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8298   if (slp_node)
8299     {
8300       /* Enforced above.  */
8301       unsigned int const_nunits = nunits.to_constant ();
8302 
8303       /* The initial values are vectorized, but any lanes > group_size
8304 	 need adjustment.  */
8305       slp_tree init_node
8306 	= SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8307 
8308       /* Gather steps.  Since we do not vectorize inductions as
8309 	 cycles we have to reconstruct the step from SCEV data.  */
8310       unsigned group_size = SLP_TREE_LANES (slp_node);
8311       tree *steps = XALLOCAVEC (tree, group_size);
8312       tree *inits = XALLOCAVEC (tree, group_size);
8313       stmt_vec_info phi_info;
8314       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8315 	{
8316 	  steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8317 	  if (!init_node)
8318 	    inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8319 					   pe->dest_idx);
8320 	}
8321 
8322       /* Now generate the IVs.  */
8323       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8324       gcc_assert ((const_nunits * nvects) % group_size == 0);
8325       unsigned nivs;
8326       if (nested_in_vect_loop)
8327 	nivs = nvects;
8328       else
8329 	{
8330 	  /* Compute the number of distinct IVs we need.  First reduce
8331 	     group_size if it is a multiple of const_nunits so we get
8332 	     one IV for a group_size of 4 but const_nunits 2.  */
8333 	  unsigned group_sizep = group_size;
8334 	  if (group_sizep % const_nunits == 0)
8335 	    group_sizep = group_sizep / const_nunits;
8336 	  nivs = least_common_multiple (group_sizep,
8337 					const_nunits) / const_nunits;
8338 	}
8339       tree stept = TREE_TYPE (step_vectype);
8340       tree lupdate_mul = NULL_TREE;
8341       if (!nested_in_vect_loop)
8342 	{
8343 	  /* The number of iterations covered in one vector iteration.  */
8344 	  unsigned lup_mul = (nvects * const_nunits) / group_size;
8345 	  lupdate_mul
8346 	    = build_vector_from_val (step_vectype,
8347 				     SCALAR_FLOAT_TYPE_P (stept)
8348 				     ? build_real_from_wide (stept, lup_mul,
8349 							     UNSIGNED)
8350 				     : build_int_cstu (stept, lup_mul));
8351 	}
8352       tree peel_mul = NULL_TREE;
8353       gimple_seq init_stmts = NULL;
8354       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8355 	{
8356 	  if (SCALAR_FLOAT_TYPE_P (stept))
8357 	    peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8358 				     LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8359 	  else
8360 	    peel_mul = gimple_convert (&init_stmts, stept,
8361 				       LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8362 	  peel_mul = gimple_build_vector_from_val (&init_stmts,
8363 						   step_vectype, peel_mul);
8364 	}
8365       unsigned ivn;
8366       auto_vec<tree> vec_steps;
8367       for (ivn = 0; ivn < nivs; ++ivn)
8368 	{
8369 	  tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8370 	  tree_vector_builder init_elts (vectype, const_nunits, 1);
8371 	  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8372 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8373 	    {
8374 	      /* The scalar steps of the IVs.  */
8375 	      tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8376 	      elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8377 	      step_elts.quick_push (elt);
8378 	      if (!init_node)
8379 		{
8380 		  /* The scalar inits of the IVs if not vectorized.  */
8381 		  elt = inits[(ivn*const_nunits + eltn) % group_size];
8382 		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
8383 						  TREE_TYPE (elt)))
8384 		    elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8385 					TREE_TYPE (vectype), elt);
8386 		  init_elts.quick_push (elt);
8387 		}
8388 	      /* The number of steps to add to the initial values.  */
8389 	      unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8390 	      mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8391 				   ? build_real_from_wide (stept,
8392 							   mul_elt, UNSIGNED)
8393 				   : build_int_cstu (stept, mul_elt));
8394 	    }
8395 	  vec_step = gimple_build_vector (&init_stmts, &step_elts);
8396 	  vec_steps.safe_push (vec_step);
8397 	  tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8398 	  if (peel_mul)
8399 	    step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8400 				     step_mul, peel_mul);
8401 	  if (!init_node)
8402 	    vec_init = gimple_build_vector (&init_stmts, &init_elts);
8403 
8404 	  /* Create the induction-phi that defines the induction-operand.  */
8405 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8406 					    "vec_iv_");
8407 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
8408 	  induc_def = PHI_RESULT (induction_phi);
8409 
8410 	  /* Create the iv update inside the loop  */
8411 	  tree up = vec_step;
8412 	  if (lupdate_mul)
8413 	    up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8414 			       vec_step, lupdate_mul);
8415 	  gimple_seq stmts = NULL;
8416 	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8417 	  vec_def = gimple_build (&stmts,
8418 				  PLUS_EXPR, step_vectype, vec_def, up);
8419 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8420 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8421 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8422 		       UNKNOWN_LOCATION);
8423 
8424 	  if (init_node)
8425 	    vec_init = vect_get_slp_vect_def (init_node, ivn);
8426 	  if (!nested_in_vect_loop
8427 	      && !integer_zerop (step_mul))
8428 	    {
8429 	      vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8430 	      up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8431 				 vec_step, step_mul);
8432 	      vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8433 				      vec_def, up);
8434 	      vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8435 	    }
8436 
8437 	  /* Set the arguments of the phi node:  */
8438 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8439 
8440 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8441 	}
8442       if (!nested_in_vect_loop)
8443 	{
8444 	  /* Fill up to the number of vectors we need for the whole group.  */
8445 	  nivs = least_common_multiple (group_size,
8446 					const_nunits) / const_nunits;
8447 	  vec_steps.reserve (nivs-ivn);
8448 	  for (; ivn < nivs; ++ivn)
8449 	    {
8450 	      SLP_TREE_VEC_STMTS (slp_node)
8451 		.quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8452 	      vec_steps.quick_push (vec_steps[0]);
8453 	    }
8454 	}
8455 
8456       /* Re-use IVs when we can.  We are generating further vector
8457 	 stmts by adding VF' * stride to the IVs generated above.  */
8458       if (ivn < nvects)
8459 	{
8460 	  unsigned vfp
8461 	    = least_common_multiple (group_size, const_nunits) / group_size;
8462 	  tree lupdate_mul
8463 	    = build_vector_from_val (step_vectype,
8464 				     SCALAR_FLOAT_TYPE_P (stept)
8465 				     ? build_real_from_wide (stept,
8466 							     vfp, UNSIGNED)
8467 				     : build_int_cstu (stept, vfp));
8468 	  for (; ivn < nvects; ++ivn)
8469 	    {
8470 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8471 	      tree def = gimple_get_lhs (iv);
8472 	      if (ivn < 2*nivs)
8473 		vec_steps[ivn - nivs]
8474 		  = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8475 				  vec_steps[ivn - nivs], lupdate_mul);
8476 	      gimple_seq stmts = NULL;
8477 	      def = gimple_convert (&stmts, step_vectype, def);
8478 	      def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8479 				  def, vec_steps[ivn % nivs]);
8480 	      def = gimple_convert (&stmts, vectype, def);
8481 	      if (gimple_code (iv) == GIMPLE_PHI)
8482 		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8483 	      else
8484 		{
8485 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8486 		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8487 		}
8488 	      SLP_TREE_VEC_STMTS (slp_node)
8489 		.quick_push (SSA_NAME_DEF_STMT (def));
8490 	    }
8491 	}
8492 
8493       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8494       gcc_assert (!new_bb);
8495 
8496       return true;
8497     }
8498 
8499   init_expr = vect_phi_initial_value (phi);
8500 
8501   gimple_seq stmts = NULL;
8502   if (!nested_in_vect_loop)
8503     {
8504       /* Convert the initial value to the IV update type.  */
8505       tree new_type = TREE_TYPE (step_expr);
8506       init_expr = gimple_convert (&stmts, new_type, init_expr);
8507 
8508       /* If we are using the loop mask to "peel" for alignment then we need
8509 	 to adjust the start value here.  */
8510       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8511       if (skip_niters != NULL_TREE)
8512 	{
8513 	  if (FLOAT_TYPE_P (vectype))
8514 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8515 					skip_niters);
8516 	  else
8517 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8518 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8519 					 skip_niters, step_expr);
8520 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8521 				    init_expr, skip_step);
8522 	}
8523     }
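  /* As an illustrative sketch: with start value X, step S and
     SKIP_NITERS == 3, the adjusted start value is X - 3*S, so that lane 3
     of the initial vector IV (the first lane the loop mask leaves active)
     evaluates to X again.  */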
8524 
8525   if (stmts)
8526     {
8527       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8528       gcc_assert (!new_bb);
8529     }
8530 
8531   /* Create the vector that holds the initial_value of the induction.  */
8532   if (nested_in_vect_loop)
8533     {
8534       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8535 	 been created during vectorization of previous stmts.  We obtain it
8536 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8537       auto_vec<tree> vec_inits;
8538       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8539 				     init_expr, &vec_inits);
8540       vec_init = vec_inits[0];
8541       /* If the initial value is not of proper type, convert it.  */
8542       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8543 	{
8544 	  new_stmt
8545 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
8546 							  vect_simple_var,
8547 							  "vec_iv_"),
8548 				   VIEW_CONVERT_EXPR,
8549 				   build1 (VIEW_CONVERT_EXPR, vectype,
8550 					   vec_init));
8551 	  vec_init = gimple_assign_lhs (new_stmt);
8552 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8553 						 new_stmt);
8554 	  gcc_assert (!new_bb);
8555 	}
8556     }
8557   else
8558     {
8559       /* iv_loop is the loop to be vectorized. Create:
8560 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8561       stmts = NULL;
8562       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8563 
8564       unsigned HOST_WIDE_INT const_nunits;
8565       if (nunits.is_constant (&const_nunits))
8566 	{
8567 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
8568 	  elts.quick_push (new_name);
8569 	  for (i = 1; i < const_nunits; i++)
8570 	    {
8571 	      /* Create: new_name_i = new_name + step_expr  */
8572 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8573 				       new_name, step_expr);
8574 	      elts.quick_push (new_name);
8575 	    }
8576 	  /* Create a vector from [new_name_0, new_name_1, ...,
8577 	     new_name_nunits-1]  */
8578 	  vec_init = gimple_build_vector (&stmts, &elts);
8579 	}
8580       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8581 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
8582 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8583 				 new_name, step_expr);
8584       else
8585 	{
8586 	  /* Build:
8587 	        [base, base, base, ...]
8588 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8589 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8590 	  gcc_assert (flag_associative_math);
8591 	  tree index = build_index_vector (step_vectype, 0, 1);
8592 	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8593 							new_name);
8594 	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8595 							step_expr);
8596 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8597 	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8598 				   vec_init, step_vec);
8599 	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8600 				   vec_init, base_vec);
8601 	}
8602       vec_init = gimple_convert (&stmts, vectype, vec_init);
8603 
8604       if (stmts)
8605 	{
8606 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8607 	  gcc_assert (!new_bb);
8608 	}
8609     }
8610 
8611 
8612   /* Create the vector that holds the step of the induction.  */
8613   if (nested_in_vect_loop)
8614     /* iv_loop is nested in the loop to be vectorized. Generate:
8615        vec_step = [S, S, S, S]  */
8616     new_name = step_expr;
8617   else
8618     {
8619       /* iv_loop is the loop to be vectorized. Generate:
8620 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8621       gimple_seq seq = NULL;
8622       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8623 	{
8624 	  expr = build_int_cst (integer_type_node, vf);
8625 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8626 	}
8627       else
8628 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
8629       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8630 			       expr, step_expr);
8631       if (seq)
8632 	{
8633 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8634 	  gcc_assert (!new_bb);
8635 	}
8636     }
8637 
8638   t = unshare_expr (new_name);
8639   gcc_assert (CONSTANT_CLASS_P (new_name)
8640 	      || TREE_CODE (new_name) == SSA_NAME);
8641   new_vec = build_vector_from_val (step_vectype, t);
8642   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8643 			       new_vec, step_vectype, NULL);
8644 
8645 
8646   /* Create the following def-use cycle:
8647      loop prolog:
8648          vec_init = ...
8649 	 vec_step = ...
8650      loop:
8651          vec_iv = PHI <vec_init, vec_loop>
8652          ...
8653          STMT
8654          ...
8655          vec_loop = vec_iv + vec_step;  */
8656 
8657   /* Create the induction-phi that defines the induction-operand.  */
8658   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8659   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8660   induc_def = PHI_RESULT (induction_phi);
8661 
8662   /* Create the iv update inside the loop  */
8663   stmts = NULL;
8664   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8665   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8666   vec_def = gimple_convert (&stmts, vectype, vec_def);
8667   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8668   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8669 
8670   /* Set the arguments of the phi node:  */
8671   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8672   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8673 	       UNKNOWN_LOCATION);
8674 
8675   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8676   *vec_stmt = induction_phi;
8677 
8678   /* In case that vectorization factor (VF) is bigger than the number
8679      of elements that we can fit in a vectype (nunits), we have to generate
8680      more than one vector stmt - i.e - we need to "unroll" the
8681      vector stmt by a factor VF/nunits.  For more details see documentation
8682      in vectorizable_operation.  */
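  /* As an illustrative sketch: with VF == 8, a four-element vectype and
     scalar step S, ncopies == 2; the first copy is the vector IV
     [X, X+S, X+2*S, X+3*S] and the second copy is obtained by adding
     [4*S, 4*S, 4*S, 4*S] to it, giving [X+4*S, X+5*S, X+6*S, X+7*S].  */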
8683 
8684   if (ncopies > 1)
8685     {
8686       gimple_seq seq = NULL;
8687       /* FORNOW. This restriction should be relaxed.  */
8688       gcc_assert (!nested_in_vect_loop);
8689 
8690       /* Create the vector that holds the step of the induction.  */
8691       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8692 	{
8693 	  expr = build_int_cst (integer_type_node, nunits);
8694 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8695 	}
8696       else
8697 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8698       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8699 			       expr, step_expr);
8700       if (seq)
8701 	{
8702 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8703 	  gcc_assert (!new_bb);
8704 	}
8705 
8706       t = unshare_expr (new_name);
8707       gcc_assert (CONSTANT_CLASS_P (new_name)
8708 		  || TREE_CODE (new_name) == SSA_NAME);
8709       new_vec = build_vector_from_val (step_vectype, t);
8710       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8711 				   new_vec, step_vectype, NULL);
8712 
8713       vec_def = induc_def;
8714       for (i = 1; i < ncopies; i++)
8715 	{
8716 	  /* vec_i = vec_prev + vec_step  */
8717 	  gimple_seq stmts = NULL;
8718 	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8719 	  vec_def = gimple_build (&stmts,
8720 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
8721 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8722 
8723 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8724 	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
8725 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8726 	}
8727     }
8728 
8729   if (dump_enabled_p ())
8730     dump_printf_loc (MSG_NOTE, vect_location,
8731 		     "transform induction: created def-use cycle: %G%G",
8732 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
8733 
8734   return true;
8735 }
8736 
8737 /* Function vectorizable_live_operation.
8738 
8739    STMT_INFO computes a value that is used outside the loop.  Check if
8740    it can be supported.  */
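
/* As an illustrative example of a live operation (the names below are
   made up):

	int last;
	for (i = 0; i < N; i++)
	  last = a[i];
	use (last);

   LAST is defined inside the loop but used after it; when the loop is
   vectorized, the scalar value is recovered by extracting the last lane
   of the final vector that holds a[i].  */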
8741 
8742 bool
8743 vectorizable_live_operation (vec_info *vinfo,
8744 			     stmt_vec_info stmt_info,
8745 			     gimple_stmt_iterator *gsi,
8746 			     slp_tree slp_node, slp_instance slp_node_instance,
8747 			     int slp_index, bool vec_stmt_p,
8748 			     stmt_vector_for_cost *cost_vec)
8749 {
8750   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8751   imm_use_iterator imm_iter;
8752   tree lhs, lhs_type, bitsize;
8753   tree vectype = (slp_node
8754 		  ? SLP_TREE_VECTYPE (slp_node)
8755 		  : STMT_VINFO_VECTYPE (stmt_info));
8756   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8757   int ncopies;
8758   gimple *use_stmt;
8759   auto_vec<tree> vec_oprnds;
8760   int vec_entry = 0;
8761   poly_uint64 vec_index = 0;
8762 
8763   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8764 
8765   /* If a stmt of a reduction is live, vectorize it via
8766      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8767      validity so just trigger the transform here.  */
8768   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8769     {
8770       if (!vec_stmt_p)
8771 	return true;
8772       if (slp_node)
8773 	{
8774 	  /* For reduction chains the meta-info is attached to
8775 	     the group leader.  */
8776 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8777 	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8778 	  /* For SLP reductions we vectorize the epilogue for
8779 	     all involved stmts together.  */
8780 	  else if (slp_index != 0)
8781 	    return true;
8782 	}
8783       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8784       gcc_assert (reduc_info->is_reduc_info);
8785       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8786 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8787 	return true;
8788       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8789 					slp_node_instance);
8790       return true;
8791     }
8792 
8793   /* If STMT is not relevant and it is a simple assignment and its inputs are
8794      invariant then it can remain in place, unvectorized.  The original last
8795      scalar value that it computes will be used.  */
8796   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8797     {
8798       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8799       if (dump_enabled_p ())
8800 	dump_printf_loc (MSG_NOTE, vect_location,
8801 			 "statement is simple and uses invariant.  Leaving in "
8802 			 "place.\n");
8803       return true;
8804     }
8805 
8806   if (slp_node)
8807     ncopies = 1;
8808   else
8809     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8810 
8811   if (slp_node)
8812     {
8813       gcc_assert (slp_index >= 0);
8814 
8815       /* Get the last occurrence of the scalar index from the concatenation of
8816 	 all the slp vectors. Calculate which slp vector it is and the index
8817 	 within.  */
8818       int num_scalar = SLP_TREE_LANES (slp_node);
8819       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8820       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8821 
8822       /* Calculate which vector contains the result, and which lane of
8823 	 that vector we need.  */
8824       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8825 	{
8826 	  if (dump_enabled_p ())
8827 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8828 			     "Cannot determine which vector holds the"
8829 			     " final result.\n");
8830 	  return false;
8831 	}
8832     }
8833 
8834   if (!vec_stmt_p)
8835     {
8836       /* No transformation required.  */
8837       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8838 	{
8839 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8840 					       OPTIMIZE_FOR_SPEED))
8841 	    {
8842 	      if (dump_enabled_p ())
8843 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8844 				 "can't operate on partial vectors "
8845 				 "because the target doesn't support extract "
8846 				 "last reduction.\n");
8847 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8848 	    }
8849 	  else if (slp_node)
8850 	    {
8851 	      if (dump_enabled_p ())
8852 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8853 				 "can't operate on partial vectors "
8854 				 "because an SLP statement is live after "
8855 				 "the loop.\n");
8856 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8857 	    }
8858 	  else if (ncopies > 1)
8859 	    {
8860 	      if (dump_enabled_p ())
8861 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8862 				 "can't operate on partial vectors "
8863 				 "because ncopies is greater than 1.\n");
8864 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8865 	    }
8866 	  else
8867 	    {
8868 	      gcc_assert (ncopies == 1 && !slp_node);
8869 	      vect_record_loop_mask (loop_vinfo,
8870 				     &LOOP_VINFO_MASKS (loop_vinfo),
8871 				     1, vectype, NULL);
8872 	    }
8873 	}
8874       /* ???  Enable for loop costing as well.  */
8875       if (!loop_vinfo)
8876 	record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8877 			  0, vect_epilogue);
8878       return true;
8879     }
8880 
8881   /* Use the lhs of the original scalar statement.  */
8882   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8883   if (dump_enabled_p ())
8884     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8885 		     "stmt %G", stmt);
8886 
8887   lhs = gimple_get_lhs (stmt);
8888   lhs_type = TREE_TYPE (lhs);
8889 
8890   bitsize = vector_element_bits_tree (vectype);
8891 
8892   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8893   tree vec_lhs, bitstart;
8894   gimple *vec_stmt;
8895   if (slp_node)
8896     {
8897       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8898 
8899       /* Get the correct slp vectorized stmt.  */
8900       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8901       vec_lhs = gimple_get_lhs (vec_stmt);
8902 
8903       /* Get entry to use.  */
8904       bitstart = bitsize_int (vec_index);
8905       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8906     }
8907   else
8908     {
8909       /* For multiple copies, get the last copy.  */
8910       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8911       vec_lhs = gimple_get_lhs (vec_stmt);
8912 
8913       /* Get the last lane in the vector.  */
8914       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8915     }
8916 
8917   if (loop_vinfo)
8918     {
8919       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8920 	 requirement, insert one phi node for it.  It looks like:
8921 	   loop;
8922 	 BB:
8923 	   # lhs' = PHI <lhs>
8924 	 ==>
8925 	   loop;
8926 	 BB:
8927 	   # vec_lhs' = PHI <vec_lhs>
8928 	   new_tree = lane_extract <vec_lhs', ...>;
8929 	   lhs' = new_tree;  */
8930 
8931       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8932       basic_block exit_bb = single_exit (loop)->dest;
8933       gcc_assert (single_pred_p (exit_bb));
8934 
8935       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8936       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8937       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8938 
8939       gimple_seq stmts = NULL;
8940       tree new_tree;
8941       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8942 	{
8943 	  /* Emit:
8944 
8945 	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8946 
8947 	     where VEC_LHS is the vectorized live-out result and MASK is
8948 	     the loop mask for the final iteration.  */
8949 	  gcc_assert (ncopies == 1 && !slp_node);
8950 	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8951 	  tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8952 					  1, vectype, 0);
8953 	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8954 					  mask, vec_lhs_phi);
8955 
8956 	  /* Convert the extracted vector element to the scalar type.  */
8957 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8958 	}
8959       else
8960 	{
8961 	  tree bftype = TREE_TYPE (vectype);
8962 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
8963 	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8964 	  new_tree = build3 (BIT_FIELD_REF, bftype,
8965 			     vec_lhs_phi, bitsize, bitstart);
8966 	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8967 					   &stmts, true, NULL_TREE);
8968 	}
8969 
8970       if (stmts)
8971 	{
8972 	  gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8973 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8974 
8975 	  /* Remove existing phi from lhs and create one copy from new_tree.  */
8976 	  tree lhs_phi = NULL_TREE;
8977 	  gimple_stmt_iterator gsi;
8978 	  for (gsi = gsi_start_phis (exit_bb);
8979 	       !gsi_end_p (gsi); gsi_next (&gsi))
8980 	    {
8981 	      gimple *phi = gsi_stmt (gsi);
8982 	      if ((gimple_phi_arg_def (phi, 0) == lhs))
8983 		{
8984 		  remove_phi_node (&gsi, false);
8985 		  lhs_phi = gimple_phi_result (phi);
8986 		  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8987 		  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8988 		  break;
8989 		}
8990 	    }
8991 	}
8992 
8993       /* Replace use of lhs with newly computed result.  If the use stmt is a
8994 	 single arg PHI, just replace all uses of PHI result.  It's necessary
8995 	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
8996       use_operand_p use_p;
8997       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8998 	if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8999 	    && !is_gimple_debug (use_stmt))
9000 	  {
9001 	    if (gimple_code (use_stmt) == GIMPLE_PHI
9002 		&& gimple_phi_num_args (use_stmt) == 1)
9003 	      {
9004 		replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9005 	      }
9006 	    else
9007 	      {
9008 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9009 		    SET_USE (use_p, new_tree);
9010 	      }
9011 	    update_stmt (use_stmt);
9012 	  }
9013     }
9014   else
9015     {
9016       /* For basic-block vectorization simply insert the lane-extraction.  */
9017       tree bftype = TREE_TYPE (vectype);
9018       if (VECTOR_BOOLEAN_TYPE_P (vectype))
9019 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9020       tree new_tree = build3 (BIT_FIELD_REF, bftype,
9021 			      vec_lhs, bitsize, bitstart);
9022       gimple_seq stmts = NULL;
9023       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9024 				       &stmts, true, NULL_TREE);
9025       if (TREE_CODE (new_tree) == SSA_NAME
9026 	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9027 	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9028       if (is_a <gphi *> (vec_stmt))
9029 	{
9030 	  gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9031 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9032 	}
9033       else
9034 	{
9035 	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9036 	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9037 	}
9038 
9039       /* Replace use of lhs with newly computed result.  If the use stmt is a
9040 	 single arg PHI, just replace all uses of PHI result.  It's necessary
9041 	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
9042       use_operand_p use_p;
9043       stmt_vec_info use_stmt_info;
9044       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9045 	if (!is_gimple_debug (use_stmt)
9046 	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9047 		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9048 	  {
9049 	    /* ???  This can happen when the live lane ends up being
9050 	       used in a vector construction code-generated by an
9051 	       external SLP node (and code-generation for that already
9052 	       happened).  See gcc.dg/vect/bb-slp-47.c.
9053 	       Doing this is what would happen if that vector CTOR
9054 	       were not code-generated yet so it is not too bad.
9055 	       ???  In fact we'd likely want to avoid this situation
9056 	       in the first place.  */
9057 	    if (TREE_CODE (new_tree) == SSA_NAME
9058 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9059 		&& gimple_code (use_stmt) != GIMPLE_PHI
9060 		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9061 						use_stmt))
9062 	      {
9063 		enum tree_code code = gimple_assign_rhs_code (use_stmt);
9064 		gcc_checking_assert (code == SSA_NAME
9065 				     || code == CONSTRUCTOR
9066 				     || code == VIEW_CONVERT_EXPR
9067 				     || CONVERT_EXPR_CODE_P (code));
9068 		if (dump_enabled_p ())
9069 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9070 				   "Using original scalar computation for "
9071 				   "live lane because use precedes vector "
9072 				   "def\n");
9073 		continue;
9074 	      }
9075 	    /* ???  It can also happen that we end up pulling a def into
9076 	       a loop where replacing out-of-loop uses would require
9077 	       a new LC SSA PHI node.  Retain the original scalar in
9078 	       those cases as well.  PR98064.  */
9079 	    if (TREE_CODE (new_tree) == SSA_NAME
9080 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9081 		&& (gimple_bb (use_stmt)->loop_father
9082 		    != gimple_bb (vec_stmt)->loop_father)
9083 		&& !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9084 					gimple_bb (use_stmt)->loop_father))
9085 	      {
9086 		if (dump_enabled_p ())
9087 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9088 				   "Using original scalar computation for "
9089 				   "live lane because there is an out-of-loop "
9090 				   "definition for it\n");
9091 		continue;
9092 	      }
9093 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9094 	      SET_USE (use_p, new_tree);
9095 	    update_stmt (use_stmt);
9096 	  }
9097     }
9098 
9099   return true;
9100 }
9101 
9102 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9103 
9104 static void
9105 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9106 {
9107   ssa_op_iter op_iter;
9108   imm_use_iterator imm_iter;
9109   def_operand_p def_p;
9110   gimple *ustmt;
9111 
9112   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9113     {
9114       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9115 	{
9116 	  basic_block bb;
9117 
9118 	  if (!is_gimple_debug (ustmt))
9119 	    continue;
9120 
9121 	  bb = gimple_bb (ustmt);
9122 
9123 	  if (!flow_bb_inside_loop_p (loop, bb))
9124 	    {
9125 	      if (gimple_debug_bind_p (ustmt))
9126 		{
9127 		  if (dump_enabled_p ())
9128 		    dump_printf_loc (MSG_NOTE, vect_location,
9129                                      "killing debug use\n");
9130 
9131 		  gimple_debug_bind_reset_value (ustmt);
9132 		  update_stmt (ustmt);
9133 		}
9134 	      else
9135 		gcc_unreachable ();
9136 	    }
9137 	}
9138     }
9139 }
9140 
9141 /* Given loop represented by LOOP_VINFO, return true if computation of
9142    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9143    otherwise.  */
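
/* As an illustrative case: with a 32-bit unsigned IV type and
   NITERSM1 == 0xffffffff, NITERS == NITERSM1 + 1 wraps to zero, so the
   constant check below fails and we can only return true if the upper
   bound on the loop iteration count is known to be below the type's
   maximum value.  */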
9144 
9145 static bool
9146 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9147 {
9148   /* Constant case.  */
9149   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9150     {
9151       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9152       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9153 
9154       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9155       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9156       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9157 	return true;
9158     }
9159 
9160   widest_int max;
9161   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9162   /* Check the upper bound of loop niters.  */
9163   if (get_max_loop_iterations (loop, &max))
9164     {
9165       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9166       signop sgn = TYPE_SIGN (type);
9167       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9168       if (max < type_max)
9169 	return true;
9170     }
9171   return false;
9172 }
9173 
9174 /* Return a mask type with half the number of elements as OLD_TYPE,
9175    given that it should have mode NEW_MODE.  */
9176 
9177 tree
9178 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9179 {
9180   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9181   return build_truth_vector_type_for_mode (nunits, new_mode);
9182 }
9183 
9184 /* Return a mask type with twice as many elements as OLD_TYPE,
9185    given that it should have mode NEW_MODE.  */
9186 
9187 tree
9188 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9189 {
9190   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9191   return build_truth_vector_type_for_mode (nunits, new_mode);
9192 }
9193 
9194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9195    contain a sequence of NVECTORS masks that each control a vector of type
9196    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9197    these vector masks with the vector version of SCALAR_MASK.  */
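
/* As an illustrative sketch: a statement with vectype V8HI in a loop
   with VF == 16 needs NVECTORS == 2 masks; NSCALARS_PER_ITER below then
   works out to 2 * 8 / 16 == 1 and the rgroup is recorded in
   MASKS[NVECTORS - 1].  */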
9198 
9199 void
9200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9201 		       unsigned int nvectors, tree vectype, tree scalar_mask)
9202 {
9203   gcc_assert (nvectors != 0);
9204   if (masks->length () < nvectors)
9205     masks->safe_grow_cleared (nvectors, true);
9206   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9207   /* The number of scalars per iteration and the number of vectors are
9208      both compile-time constants.  */
9209   unsigned int nscalars_per_iter
9210     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9211 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9212 
9213   if (scalar_mask)
9214     {
9215       scalar_cond_masked_key cond (scalar_mask, nvectors);
9216       loop_vinfo->scalar_cond_masked_set.add (cond);
9217     }
9218 
9219   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9220     {
9221       rgm->max_nscalars_per_iter = nscalars_per_iter;
9222       rgm->type = truth_type_for (vectype);
9223       rgm->factor = 1;
9224     }
9225 }
9226 
9227 /* Given a complete set of masks MASKS, extract mask number INDEX
9228    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9229    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9230 
9231    See the comment above vec_loop_masks for more details about the mask
9232    arrangement.  */
9233 
9234 tree
9235 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9236 		    unsigned int nvectors, tree vectype, unsigned int index)
9237 {
9238   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9239   tree mask_type = rgm->type;
9240 
9241   /* Populate the rgroup's mask array, if this is the first time we've
9242      used it.  */
9243   if (rgm->controls.is_empty ())
9244     {
9245       rgm->controls.safe_grow_cleared (nvectors, true);
9246       for (unsigned int i = 0; i < nvectors; ++i)
9247 	{
9248 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9249 	  /* Provide a dummy definition until the real one is available.  */
9250 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9251 	  rgm->controls[i] = mask;
9252 	}
9253     }
9254 
9255   tree mask = rgm->controls[index];
9256   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9257 		TYPE_VECTOR_SUBPARTS (vectype)))
9258     {
9259       /* A loop mask for data type X can be reused for data type Y
9260 	 if X has N times more elements than Y and if Y's elements
9261 	 are N times bigger than X's.  In this case each sequence
9262 	 of N elements in the loop mask will be all-zero or all-one.
9263 	 We can then view-convert the mask so that each sequence of
9264 	 N elements is replaced by a single element.  */
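      /* As an illustrative case: a mask created for a 16-element vector
	 of bytes can be reused for a 4-element vector of ints; every
	 aligned group of four mask elements is all-zero or all-one, so
	 the VIEW_CONVERT_EXPR below collapses each such group into the
	 single element that VECTYPE's mask needs.  */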
9265       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9266 			      TYPE_VECTOR_SUBPARTS (vectype)));
9267       gimple_seq seq = NULL;
9268       mask_type = truth_type_for (vectype);
9269       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9270       if (seq)
9271 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9272     }
9273   return mask;
9274 }
9275 
9276 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9277    lengths for controlling an operation on VECTYPE.  The operation splits
9278    each element of VECTYPE into FACTOR separate subelements, measuring the
9279    length as a number of these subelements.  */
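
/* As an illustrative sketch: if VECTYPE is a 4-element vector of 32-bit
   ints and FACTOR is 4, the recorded length is measured in bytes rather
   than whole elements, matching targets whose length-controlled loads
   and stores fall back to byte (VnQI) granularity.  */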
9280 
9281 void
9282 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9283 		      unsigned int nvectors, tree vectype, unsigned int factor)
9284 {
9285   gcc_assert (nvectors != 0);
9286   if (lens->length () < nvectors)
9287     lens->safe_grow_cleared (nvectors, true);
9288   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9289 
9290   /* The number of scalars per iteration, the number of scalar-occupied
9291      bytes and the number of vectors are all compile-time constants.  */
9292   unsigned int nscalars_per_iter
9293     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9294 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9295 
9296   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9297     {
9298       /* For now, we only support cases in which all loads and stores fall back
9299 	 to VnQI or none do.  */
9300       gcc_assert (!rgl->max_nscalars_per_iter
9301 		  || (rgl->factor == 1 && factor == 1)
9302 		  || (rgl->max_nscalars_per_iter * rgl->factor
9303 		      == nscalars_per_iter * factor));
9304       rgl->max_nscalars_per_iter = nscalars_per_iter;
9305       rgl->type = vectype;
9306       rgl->factor = factor;
9307     }
9308 }
9309 
9310 /* Given a complete set of length LENS, extract length number INDEX for an
9311    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9312 
9313 tree
9314 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9315 		   unsigned int nvectors, unsigned int index)
9316 {
9317   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9318   bool use_bias_adjusted_len =
9319     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
9320 
9321   /* Populate the rgroup's len array, if this is the first time we've
9322      used it.  */
9323   if (rgl->controls.is_empty ())
9324     {
9325       rgl->controls.safe_grow_cleared (nvectors, true);
9326       for (unsigned int i = 0; i < nvectors; ++i)
9327 	{
9328 	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9329 	  gcc_assert (len_type != NULL_TREE);
9330 
9331 	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9332 
9333 	  /* Provide a dummy definition until the real one is available.  */
9334 	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9335 	  rgl->controls[i] = len;
9336 
9337 	  if (use_bias_adjusted_len)
9338 	    {
9339 	      gcc_assert (i == 0);
9340 	      tree adjusted_len =
9341 		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9342 	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9343 	      rgl->bias_adjusted_ctrl = adjusted_len;
9344 	    }
9345 	}
9346     }
9347 
9348   if (use_bias_adjusted_len)
9349     return rgl->bias_adjusted_ctrl;
9350   else
9351     return rgl->controls[index];
9352 }
9353 
9354 /* Scale profiling counters by estimation for LOOP which is vectorized
9355    by factor VF.  */
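
/* As an illustrative sketch: if the profile estimates roughly 99
   iterations per entry and VF is 4, the vectorized body is expected to
   run about 24 times per entry, so the body counts are scaled down
   accordingly and the exit edge probability becomes roughly 1/25.  */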
9356 
9357 static void
9358 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9359 {
9360   edge preheader = loop_preheader_edge (loop);
9361   /* Reduce loop iterations by the vectorization factor.  */
9362   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9363   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9364 
9365   if (freq_h.nonzero_p ())
9366     {
9367       profile_probability p;
9368 
9369       /* Avoid dropping loop body profile counter to 0 because of zero count
9370 	 in loop's preheader.  */
9371       if (!(freq_e == profile_count::zero ()))
9372         freq_e = freq_e.force_nonzero ();
9373       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9374       scale_loop_frequencies (loop, p);
9375     }
9376 
9377   edge exit_e = single_exit (loop);
9378   exit_e->probability = profile_probability::always ()
9379 				 .apply_scale (1, new_est_niter + 1);
9380 
9381   edge exit_l = single_pred_edge (loop->latch);
9382   profile_probability prob = exit_l->probability;
9383   exit_l->probability = exit_e->probability.invert ();
9384   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9385     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9386 }
9387 
9388 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9389    latch edge values originally defined by it.  */
9390 
9391 static void
9392 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9393 				     stmt_vec_info def_stmt_info)
9394 {
9395   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9396   if (!def || TREE_CODE (def) != SSA_NAME)
9397     return;
9398   stmt_vec_info phi_info;
9399   imm_use_iterator iter;
9400   use_operand_p use_p;
9401   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9402     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9403       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9404 	  && (phi_info = loop_vinfo->lookup_stmt (phi))
9405 	  && STMT_VINFO_RELEVANT_P (phi_info)
9406 	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9407 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9408 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9409 	{
9410 	  loop_p loop = gimple_bb (phi)->loop_father;
9411 	  edge e = loop_latch_edge (loop);
9412 	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9413 	    {
9414 	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9415 	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9416 	      gcc_assert (phi_defs.length () == latch_defs.length ());
9417 	      for (unsigned i = 0; i < phi_defs.length (); ++i)
9418 		add_phi_arg (as_a <gphi *> (phi_defs[i]),
9419 			     gimple_get_lhs (latch_defs[i]), e,
9420 			     gimple_phi_arg_location (phi, e->dest_idx));
9421 	    }
9422 	}
9423 }
9424 
9425 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9426    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9427    stmt_vec_info.  */
9428 
9429 static bool
9430 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9431 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9432 {
9433   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9434   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9435 
9436   if (dump_enabled_p ())
9437     dump_printf_loc (MSG_NOTE, vect_location,
9438 		     "------>vectorizing statement: %G", stmt_info->stmt);
9439 
9440   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9441     vect_loop_kill_debug_uses (loop, stmt_info);
9442 
9443   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9444       && !STMT_VINFO_LIVE_P (stmt_info))
9445     return false;
9446 
9447   if (STMT_VINFO_VECTYPE (stmt_info))
9448     {
9449       poly_uint64 nunits
9450 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9451       if (!STMT_SLP_TYPE (stmt_info)
9452 	  && maybe_ne (nunits, vf)
9453 	  && dump_enabled_p ())
9454 	/* For SLP VF is set according to unrolling factor, and not
9455 	   to vector size, hence for SLP this print is not valid.  */
9456 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9457     }
9458 
9459   /* Pure SLP statements have already been vectorized.  We still need
9460      to apply loop vectorization to hybrid SLP statements.  */
9461   if (PURE_SLP_STMT (stmt_info))
9462     return false;
9463 
9464   if (dump_enabled_p ())
9465     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9466 
9467   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9468     *seen_store = stmt_info;
9469 
9470   return true;
9471 }
9472 
9473 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9474    in the hash_map with its corresponding values.  */
9475 
9476 static tree
9477 find_in_mapping (tree t, void *context)
9478 {
9479   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9480 
9481   tree *value = mapping->get (t);
9482   return value ? *value : t;
9483 }
9484 
9485 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9486    original loop that has now been vectorized.
9487 
9488    The inits of the data_references need to be advanced with the number of
9489    iterations of the main loop.  This has been computed in vect_do_peeling and
9490    is stored in parameter ADVANCE.  We first restore the data_references
9491    initial offset with the values recorded in ORIG_DRS_INIT.
9492 
9493    Since the loop_vec_info of this EPILOGUE was constructed for the original
9494    loop, its stmt_vec_infos all point to the original statements.  These need
9495    to be updated to point to their corresponding copies as well as the SSA_NAMES
9496    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9497 
9498    The data_reference's connections also need to be updated.  Their
9499    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9500    stmt_vec_infos, their statements need to point to their corresponding copy,
9501    if they are gather loads or scatter stores then their reference needs to be
9502    updated to point to its corresponding copy.  */
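
/* As an illustrative sketch (ignoring prologue peeling): if the main
   loop was vectorized with VF == 4 and thus handled the first
   (niters / 4) * 4 scalar iterations, ADVANCE is that iteration count
   and every data reference in the epilogue starts that many scalar
   elements further into the underlying object.  */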
9503 
9504 static void
9505 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9506 {
9507   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9508   auto_vec<gimple *> stmt_worklist;
9509   hash_map<tree,tree> mapping;
9510   gimple *orig_stmt, *new_stmt;
9511   gimple_stmt_iterator epilogue_gsi;
9512   gphi_iterator epilogue_phi_gsi;
9513   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9514   basic_block *epilogue_bbs = get_loop_body (epilogue);
9515   unsigned i;
9516 
9517   free (LOOP_VINFO_BBS (epilogue_vinfo));
9518   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9519 
9520   /* Advance data_references with the number of iterations of the previous
9521      loop and its prologue.  */
9522   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9523 
9524 
9525   /* The EPILOGUE loop is a copy of the original loop so they share the same
9526      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9527      point to the copied statements.  We also create a mapping of all LHS' in
9528      the original loop and all the LHS' in the EPILOGUE and create worklists to
9529      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9530   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9531     {
9532       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9533 	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9534 	{
9535 	  new_stmt = epilogue_phi_gsi.phi ();
9536 
9537 	  gcc_assert (gimple_uid (new_stmt) > 0);
9538 	  stmt_vinfo
9539 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9540 
9541 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9542 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9543 
9544 	  mapping.put (gimple_phi_result (orig_stmt),
9545 		       gimple_phi_result (new_stmt));
9546 	  /* PHI nodes can not have patterns or related statements.  */
9547 	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9548 		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9549 	}
9550 
9551       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9552 	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9553 	{
9554 	  new_stmt = gsi_stmt (epilogue_gsi);
9555 	  if (is_gimple_debug (new_stmt))
9556 	    continue;
9557 
9558 	  gcc_assert (gimple_uid (new_stmt) > 0);
9559 	  stmt_vinfo
9560 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9561 
9562 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9563 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9564 
9565 	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
9566 	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9567 
9568 	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9569 	    {
9570 	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9571 	      for (gimple_stmt_iterator gsi = gsi_start (seq);
9572 		   !gsi_end_p (gsi); gsi_next (&gsi))
9573 		stmt_worklist.safe_push (gsi_stmt (gsi));
9574 	    }
9575 
9576 	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9577 	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9578 	    {
9579 	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9580 	      stmt_worklist.safe_push (stmt);
9581 	      /* Set BB such that the assert in
9582 		'get_initial_def_for_reduction' is able to determine that
9583 		the BB of the related stmt is inside this loop.  */
9584 	      gimple_set_bb (stmt,
9585 			     gimple_bb (new_stmt));
9586 	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9587 	      gcc_assert (related_vinfo == NULL
9588 			  || related_vinfo == stmt_vinfo);
9589 	    }
9590 	}
9591     }
9592 
9593   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9594      using the original main loop and thus need to be updated to refer to the
9595      cloned variables used in the epilogue.  */
9596   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9597     {
9598       gimple *stmt = stmt_worklist[i];
9599       tree *new_op;
9600 
9601       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9602 	{
9603 	  tree op = gimple_op (stmt, j);
9604 	  if ((new_op = mapping.get(op)))
9605 	    gimple_set_op (stmt, j, *new_op);
9606 	  else
9607 	    {
9608 	      /* PR92429: The last argument of simplify_replace_tree disables
9609 		 folding when replacing arguments.  This is required as
9610 		 otherwise you might end up with different statements than the
9611 		 ones analyzed in vect_loop_analyze, leading to different
9612 		 vectorization.  */
9613 	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9614 					  &find_in_mapping, &mapping, false);
9615 	      gimple_set_op (stmt, j, op);
9616 	    }
9617 	}
9618     }
9619 
9620   struct data_reference *dr;
9621   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9622   FOR_EACH_VEC_ELT (datarefs, i, dr)
9623     {
9624       orig_stmt = DR_STMT (dr);
9625       gcc_assert (gimple_uid (orig_stmt) > 0);
9626       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9627       /* Data references for gather loads and scatter stores do not use the
9628 	 updated offset we set using ADVANCE.  Instead we have to make sure the
9629 	 reference in each data reference points to the corresponding copy of
9630 	 the original in the epilogue.  */
9631       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9632 	  == VMAT_GATHER_SCATTER)
9633 	{
9634 	  DR_REF (dr)
9635 	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9636 				     &find_in_mapping, &mapping);
9637 	  DR_BASE_ADDRESS (dr)
9638 	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9639 				     &find_in_mapping, &mapping);
9640 	}
9641       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9642       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9643     }
9644 
9645   epilogue_vinfo->shared->datarefs_copy.release ();
9646   epilogue_vinfo->shared->save_datarefs ();
9647 }
9648 
9649 /* Function vect_transform_loop.
9650 
9651    The analysis phase has determined that the loop is vectorizable.
9652    Vectorize the loop - created vectorized stmts to replace the scalar
9653    stmts in the loop, and update the loop exit condition.
9654    Returns scalar epilogue loop if any.  */
9655 
9656 class loop *
9657 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9658 {
9659   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9660   class loop *epilogue = NULL;
9661   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9662   int nbbs = loop->num_nodes;
9663   int i;
9664   tree niters_vector = NULL_TREE;
9665   tree step_vector = NULL_TREE;
9666   tree niters_vector_mult_vf = NULL_TREE;
9667   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9668   unsigned int lowest_vf = constant_lower_bound (vf);
9669   gimple *stmt;
9670   bool check_profitability = false;
9671   unsigned int th;
9672 
9673   DUMP_VECT_SCOPE ("vec_transform_loop");
9674 
9675   loop_vinfo->shared->check_datarefs ();
9676 
9677   /* Use the more conservative vectorization threshold.  If the number
9678      of iterations is constant assume the cost check has been performed
9679      by our caller.  If the threshold makes all loops profitable that
9680      run at least the (estimated) vectorization factor number of times
9681      checking is pointless, too.  */
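  /* As an illustrative sketch: with a threshold of 16, the generated
     runtime check sends executions with fewer than roughly 16 iterations
     to the scalar loop and only lets the remaining ones enter the
     vectorized loop.  */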
9682   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9683   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9684     {
9685       if (dump_enabled_p ())
9686 	dump_printf_loc (MSG_NOTE, vect_location,
9687 			 "Profitability threshold is %d loop iterations.\n",
9688 			 th);
9689       check_profitability = true;
9690     }
9691 
9692   /* Make sure there exists a single-predecessor exit bb.  Do this before
9693      versioning.   */
9694   edge e = single_exit (loop);
9695   if (! single_pred_p (e->dest))
9696     {
9697       split_loop_exit_edge (e, true);
9698       if (dump_enabled_p ())
9699 	dump_printf (MSG_NOTE, "split exit edge\n");
9700     }
9701 
9702   /* Version the loop first, if required, so the profitability check
9703      comes first.  */
9704 
9705   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9706     {
9707       class loop *sloop
9708 	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9709       sloop->force_vectorize = false;
9710       check_profitability = false;
9711     }
9712 
9713   /* Make sure there exists a single-predecessor exit bb also on the
9714      scalar loop copy.  Do this after versioning but before peeling
9715      so CFG structure is fine for both scalar and if-converted loop
9716      to make slpeel_duplicate_current_defs_from_edges face matched
9717      loop closed PHI nodes on the exit.  */
9718   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9719     {
9720       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9721       if (! single_pred_p (e->dest))
9722 	{
9723 	  split_loop_exit_edge (e, true);
9724 	  if (dump_enabled_p ())
9725 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9726 	}
9727     }
9728 
9729   tree niters = vect_build_loop_niters (loop_vinfo);
9730   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9731   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9732   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9733   tree advance;
9734   drs_init_vec orig_drs_init;
9735 
9736   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9737 			      &step_vector, &niters_vector_mult_vf, th,
9738 			      check_profitability, niters_no_overflow,
9739 			      &advance);
9740 
9741   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9742       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9743     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9744 			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9745 
9746   if (niters_vector == NULL_TREE)
9747     {
9748       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9749 	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9750 	  && known_eq (lowest_vf, vf))
9751 	{
9752 	  niters_vector
9753 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9754 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9755 	  step_vector = build_one_cst (TREE_TYPE (niters));
9756 	}
9757       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9758 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9759 				     &step_vector, niters_no_overflow);
9760       else
9761 	/* vect_do_peeling subtracted the number of peeled prologue
9762 	   iterations from LOOP_VINFO_NITERS.  */
9763 	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9764 				     &niters_vector, &step_vector,
9765 				     niters_no_overflow);
9766     }
9767 
9768   /* 1) Make sure the loop header has exactly two entries
9769      2) Make sure we have a preheader basic block.  */
9770 
9771   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9772 
9773   split_edge (loop_preheader_edge (loop));
9774 
9775   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9776     /* This will deal with any possible peeling.  */
9777     vect_prepare_for_masked_peels (loop_vinfo);
9778 
9779   /* Schedule the SLP instances first, then handle loop vectorization
9780      below.  */
9781   if (!loop_vinfo->slp_instances.is_empty ())
9782     {
9783       DUMP_VECT_SCOPE ("scheduling SLP instances");
9784       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9785     }
9786 
9787   /* FORNOW: the vectorizer supports only loops whose body consists
9788      of one basic block (header + empty latch).  When the vectorizer
9789      supports more involved loop forms, the order in which the BBs are
9790      traversed will need to be reconsidered.  */
9791 
9792   for (i = 0; i < nbbs; i++)
9793     {
9794       basic_block bb = bbs[i];
9795       stmt_vec_info stmt_info;
9796 
9797       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9798 	   gsi_next (&si))
9799 	{
9800 	  gphi *phi = si.phi ();
9801 	  if (dump_enabled_p ())
9802 	    dump_printf_loc (MSG_NOTE, vect_location,
9803 			     "------>vectorizing phi: %G", phi);
9804 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9805 	  if (!stmt_info)
9806 	    continue;
9807 
9808 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9809 	    vect_loop_kill_debug_uses (loop, stmt_info);
9810 
9811 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9812 	      && !STMT_VINFO_LIVE_P (stmt_info))
9813 	    continue;
9814 
9815 	  if (STMT_VINFO_VECTYPE (stmt_info)
9816 	      && (maybe_ne
9817 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9818 	      && dump_enabled_p ())
9819 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9820 
9821 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9822 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9823 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9824 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9825 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9826 	      && ! PURE_SLP_STMT (stmt_info))
9827 	    {
9828 	      if (dump_enabled_p ())
9829 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9830 	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9831 	    }
9832 	}
9833 
9834       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9835 	   gsi_next (&si))
9836 	{
9837 	  gphi *phi = si.phi ();
9838 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9839 	  if (!stmt_info)
9840 	    continue;
9841 
9842 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9843 	      && !STMT_VINFO_LIVE_P (stmt_info))
9844 	    continue;
9845 
9846 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9847 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9848 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9849 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9850 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9851 	      && ! PURE_SLP_STMT (stmt_info))
9852 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9853 	}
9854 
9855       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9856 	   !gsi_end_p (si);)
9857 	{
9858 	  stmt = gsi_stmt (si);
9859 	  /* During vectorization remove existing clobber stmts.  */
9860 	  if (gimple_clobber_p (stmt))
9861 	    {
9862 	      unlink_stmt_vdef (stmt);
9863 	      gsi_remove (&si, true);
9864 	      release_defs (stmt);
9865 	    }
9866 	  else
9867 	    {
9868 	      /* Ignore vector stmts created in the outer loop.  */
9869 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
9870 
9871 	      /* vector stmts created in the outer-loop during vectorization of
9872 		 stmts in an inner-loop may not have a stmt_info, and do not
9873 		 need to be vectorized.  */
9874 	      stmt_vec_info seen_store = NULL;
9875 	      if (stmt_info)
9876 		{
9877 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9878 		    {
9879 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9880 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9881 			   !gsi_end_p (subsi); gsi_next (&subsi))
9882 			{
9883 			  stmt_vec_info pat_stmt_info
9884 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9885 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9886 						    &si, &seen_store);
9887 			}
9888 		      stmt_vec_info pat_stmt_info
9889 			= STMT_VINFO_RELATED_STMT (stmt_info);
9890 		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9891 						    &si, &seen_store))
9892 			maybe_set_vectorized_backedge_value (loop_vinfo,
9893 							     pat_stmt_info);
9894 		    }
9895 		  else
9896 		    {
9897 		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9898 						    &seen_store))
9899 			maybe_set_vectorized_backedge_value (loop_vinfo,
9900 							     stmt_info);
9901 		    }
9902 		}
9903 	      gsi_next (&si);
9904 	      if (seen_store)
9905 		{
9906 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9907 		    /* Interleaving.  The vectorization of the
9908 		       interleaving chain was completed; free all
9909 		       the stores in the chain.  */
9910 		    vect_remove_stores (loop_vinfo,
9911 					DR_GROUP_FIRST_ELEMENT (seen_store));
9912 		  else
9913 		    /* Free the attached stmt_vec_info and remove the stmt.  */
9914 		    loop_vinfo->remove_stmt (stmt_info);
9915 		}
9916 	    }
9917 	}
9918 
9919       /* Stub out scalar statements that must not survive vectorization.
9920 	 Doing this here helps with grouped statements, or statements that
9921 	 are involved in patterns.  */
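      /* Editorial illustration (not from the original source; the SSA
	 names are made up): a leftover scalar
	   _5 = .MASK_LOAD (ptr_3, 0B, mask_7);
	 is stubbed out as  _5 = 0;  and a leftover scalar conditional
	 call such as
	   _8 = .COND_ADD (mask_7, _1, _2, else_9);
	 is replaced by its "else" argument, i.e.  _8 = else_9;  so that
	 no scalar masked operation survives vectorization.  */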
9922       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9923 	   !gsi_end_p (gsi); gsi_next (&gsi))
9924 	{
9925 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9926 	  if (!call || !gimple_call_internal_p (call))
9927 	    continue;
9928 	  internal_fn ifn = gimple_call_internal_fn (call);
9929 	  if (ifn == IFN_MASK_LOAD)
9930 	    {
9931 	      tree lhs = gimple_get_lhs (call);
9932 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9933 		{
9934 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
9935 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
9936 		  gsi_replace (&gsi, new_stmt, true);
9937 		}
9938 	    }
9939 	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9940 	    {
9941 	      tree lhs = gimple_get_lhs (call);
9942 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9943 		{
9944 		  tree else_arg
9945 		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9946 		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9947 		  gsi_replace (&gsi, new_stmt, true);
9948 		}
9949 	    }
9950 	}
9951     }				/* BBs in loop */
9952 
9953   /* The vectorization factor is always > 1, so if we use an IV increment
9954      of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9955   if (integer_onep (step_vector))
9956     niters_no_overflow = true;
9957   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9958 			   niters_vector_mult_vf, !niters_no_overflow);
9959 
9960   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9961   scale_profile_for_vect_loop (loop, assumed_vf);
9962 
9963   /* True if the final iteration might not handle a full vector's
9964      worth of scalar iterations.  */
9965   bool final_iter_may_be_partial
9966     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9967   /* The minimum number of iterations performed by the epilogue.  This
9968      is 1 when peeling for gaps because we always need a final scalar
9969      iteration.  */
9970   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9971   /* +1 to convert latch counts to loop iteration counts,
9972      -min_epilogue_iters to remove iterations that cannot be performed
9973        by the vector code.  */
9974   int bias_for_lowest = 1 - min_epilogue_iters;
9975   int bias_for_assumed = bias_for_lowest;
9976   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9977   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9978     {
9979       /* When the amount of peeling is known at compile time, the first
9980 	 iteration will have exactly alignment_npeels active elements.
9981 	 In the worst case it will have at least one.  */
9982       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9983       bias_for_lowest += lowest_vf - min_first_active;
9984       bias_for_assumed += assumed_vf - min_first_active;
9985     }
9986   /* In these calculations the "- 1" converts loop iteration counts
9987      back to latch counts.  */
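  /* Editorial illustration (values made up): with lowest_vf == 4, no
     epilogue peeling (so bias_for_lowest == 1) and no partial vectors, a
     scalar latch bound of 11 (at most 12 iterations) becomes
     floor ((11 + 1) / 4) - 1 == 2, i.e. at most 3 vector iterations.
     When the final iteration may be partial, the ceiling is used
     instead.  */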
9988   if (loop->any_upper_bound)
9989     {
9990       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9991       loop->nb_iterations_upper_bound
9992 	= (final_iter_may_be_partial
9993 	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9994 			    lowest_vf) - 1
9995 	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9996 			     lowest_vf) - 1);
9997       if (main_vinfo
9998 	  /* Both peeling for alignment and peeling for gaps can end up
9999 	     with the scalar epilogue running for more than VF-1 iterations.  */
10000 	  && !main_vinfo->peeling_for_alignment
10001 	  && !main_vinfo->peeling_for_gaps)
10002 	{
10003 	  unsigned int bound;
10004 	  poly_uint64 main_iters
10005 	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10006 			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10007 	  main_iters
10008 	    = upper_bound (main_iters,
10009 			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10010 	  if (can_div_away_from_zero_p (main_iters,
10011 					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10012 					&bound))
10013 	    loop->nb_iterations_upper_bound
10014 	      = wi::umin ((widest_int) (bound - 1),
10015 			  loop->nb_iterations_upper_bound);
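	  /* Editorial illustration (values made up): if main_iters is
	     bounded by 31 and this epilogue's vectorization factor is 8,
	     dividing away from zero gives 4, so the epilogue's latch
	     count is bounded by 3.  */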
10016       }
10017   }
10018   if (loop->any_likely_upper_bound)
10019     loop->nb_iterations_likely_upper_bound
10020       = (final_iter_may_be_partial
10021 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10022 			  + bias_for_lowest, lowest_vf) - 1
10023 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10024 			   + bias_for_lowest, lowest_vf) - 1);
10025   if (loop->any_estimate)
10026     loop->nb_iterations_estimate
10027       = (final_iter_may_be_partial
10028 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10029 			  assumed_vf) - 1
10030 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10031 			   assumed_vf) - 1);
10032 
10033   if (dump_enabled_p ())
10034     {
10035       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10036 	{
10037 	  dump_printf_loc (MSG_NOTE, vect_location,
10038 			   "LOOP VECTORIZED\n");
10039 	  if (loop->inner)
10040 	    dump_printf_loc (MSG_NOTE, vect_location,
10041 			     "OUTER LOOP VECTORIZED\n");
10042 	  dump_printf (MSG_NOTE, "\n");
10043 	}
10044       else
10045 	dump_printf_loc (MSG_NOTE, vect_location,
10046 			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10047 			 GET_MODE_NAME (loop_vinfo->vector_mode));
10048     }
10049 
10050   /* Loops vectorized with a variable factor won't benefit from
10051      unrolling/peeling.  */
10052   if (!vf.is_constant ())
10053     {
10054       loop->unroll = 1;
10055       if (dump_enabled_p ())
10056 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10057 			 " variable-length vectorization factor\n");
10058     }
10059   /* Free SLP instances here because otherwise stmt reference counting
10060      won't work.  */
10061   slp_instance instance;
10062   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10063     vect_free_slp_instance (instance);
10064   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10065   /* Clear the safelen field since its value is invalid after vectorization:
10066      the vectorized loop can have loop-carried dependences.  */
10067   loop->safelen = 0;
10068 
10069   if (epilogue)
10070     {
10071       update_epilogue_loop_vinfo (epilogue, advance);
10072 
10073       epilogue->simduid = loop->simduid;
10074       epilogue->force_vectorize = loop->force_vectorize;
10075       epilogue->dont_vectorize = false;
10076     }
10077 
10078   return epilogue;
10079 }
10080 
10081 /* The code below attempts a simple optimization: revert
10082    if-conversion for masked stores, i.e. if the mask of a store is zero,
10083    do not perform the store and, if possible, skip the statements that
10084    produce the stored values as well.  For example,
10085      for (i=0; i<n; i++)
10086        if (c[i])
10087 	{
10088 	  p1[i] += 1;
10089 	  p2[i] = p3[i] + 2;
10090 	}
10091    this transformation will produce the following semi-hammock:
10092 
10093    if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
10094      {
10095        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10096        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10097        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10098        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10099        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10100        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10101      }
10102 */
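
/* Editorial summary of the implementation below (not from the original
   source): the block containing a MASK_STORE is split right after the
   store, a new STORE_BB is created on the false edge of a
   "mask == { 0, ... }" comparison, the MASK_STORE (and, when it is safe,
   the statements producing the stored values) is sunk into STORE_BB, and
   the virtual operand is fixed up with a PHI node in the join block.  */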
10103 
10104 void
10105 optimize_mask_stores (class loop *loop)
10106 {
10107   basic_block *bbs = get_loop_body (loop);
10108   unsigned nbbs = loop->num_nodes;
10109   unsigned i;
10110   basic_block bb;
10111   class loop *bb_loop;
10112   gimple_stmt_iterator gsi;
10113   gimple *stmt;
10114   auto_vec<gimple *> worklist;
10115   auto_purge_vect_location sentinel;
10116 
10117   vect_location = find_loop_location (loop);
10118   /* Pick up all masked stores in loop if any.  */
10119   for (i = 0; i < nbbs; i++)
10120     {
10121       bb = bbs[i];
10122       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10123 	   gsi_next (&gsi))
10124 	{
10125 	  stmt = gsi_stmt (gsi);
10126 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10127 	    worklist.safe_push (stmt);
10128 	}
10129     }
10130 
10131   free (bbs);
10132   if (worklist.is_empty ())
10133     return;
10134 
10135   /* Loop has masked stores.  */
10136   while (!worklist.is_empty ())
10137     {
10138       gimple *last, *last_store;
10139       edge e, efalse;
10140       tree mask;
10141       basic_block store_bb, join_bb;
10142       gimple_stmt_iterator gsi_to;
10143       tree vdef, new_vdef;
10144       gphi *phi;
10145       tree vectype;
10146       tree zero;
10147 
10148       last = worklist.pop ();
10149       mask = gimple_call_arg (last, 2);
10150       bb = gimple_bb (last);
10151       /* Create then_bb and the if-then structure in the CFG; then_bb
10152 	 belongs to the same loop as if_bb.  That loop can differ from LOOP
10153 	 when a two-level loop nest is vectorized and the mask_store belongs
10154 	 to the inner loop.  */
10155       e = split_block (bb, last);
10156       bb_loop = bb->loop_father;
10157       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10158       join_bb = e->dest;
10159       store_bb = create_empty_bb (bb);
10160       add_bb_to_loop (store_bb, bb_loop);
10161       e->flags = EDGE_TRUE_VALUE;
10162       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10163       /* Put STORE_BB on the unlikely path.  */
10164       efalse->probability = profile_probability::unlikely ();
10165       store_bb->count = efalse->count ();
10166       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10167       if (dom_info_available_p (CDI_DOMINATORS))
10168 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10169       if (dump_enabled_p ())
10170 	dump_printf_loc (MSG_NOTE, vect_location,
10171 			 "Create new block %d to sink mask stores.",
10172 			 store_bb->index);
10173       /* Create vector comparison with boolean result.  */
10174       vectype = TREE_TYPE (mask);
10175       zero = build_zero_cst (vectype);
10176       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10177       gsi = gsi_last_bb (bb);
10178       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10179       /* Create new PHI node for vdef of the last masked store:
10180 	 .MEM_2 = VDEF <.MEM_1>
10181 	 will be converted to
10182 	 .MEM.3 = VDEF <.MEM_1>
10183 	 and new PHI node will be created in join bb
10184 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
10185       */
10186       vdef = gimple_vdef (last);
10187       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10188       gimple_set_vdef (last, new_vdef);
10189       phi = create_phi_node (vdef, join_bb);
10190       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10191 
10192       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10193       while (true)
10194 	{
10195 	  gimple_stmt_iterator gsi_from;
10196 	  gimple *stmt1 = NULL;
10197 
10198 	  /* Move masked store to STORE_BB.  */
10199 	  last_store = last;
10200 	  gsi = gsi_for_stmt (last);
10201 	  gsi_from = gsi;
10202 	  /* Shift GSI to the previous stmt for further traversal.  */
10203 	  gsi_prev (&gsi);
10204 	  gsi_to = gsi_start_bb (store_bb);
10205 	  gsi_move_before (&gsi_from, &gsi_to);
10206 	  /* Setup GSI_TO to the non-empty block start.  */
10207 	  gsi_to = gsi_start_bb (store_bb);
10208 	  if (dump_enabled_p ())
10209 	    dump_printf_loc (MSG_NOTE, vect_location,
10210 			     "Move stmt to created bb\n%G", last);
10211 	  /* Move all stored value producers if possible.  */
10212 	  while (!gsi_end_p (gsi))
10213 	    {
10214 	      tree lhs;
10215 	      imm_use_iterator imm_iter;
10216 	      use_operand_p use_p;
10217 	      bool res;
10218 
10219 	      /* Skip debug statements.  */
10220 	      if (is_gimple_debug (gsi_stmt (gsi)))
10221 		{
10222 		  gsi_prev (&gsi);
10223 		  continue;
10224 		}
10225 	      stmt1 = gsi_stmt (gsi);
10226 	      /* Do not consider statements writing to memory or having
10227 		 a volatile operand.  */
10228 	      if (gimple_vdef (stmt1)
10229 		  || gimple_has_volatile_ops (stmt1))
10230 		break;
10231 	      gsi_from = gsi;
10232 	      gsi_prev (&gsi);
10233 	      lhs = gimple_get_lhs (stmt1);
10234 	      if (!lhs)
10235 		break;
10236 
10237 	      /* LHS of vectorized stmt must be SSA_NAME.  */
10238 	      if (TREE_CODE (lhs) != SSA_NAME)
10239 		break;
10240 
10241 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10242 		{
10243 		  /* Remove dead scalar statement.  */
10244 		  if (has_zero_uses (lhs))
10245 		    {
10246 		      gsi_remove (&gsi_from, true);
10247 		      continue;
10248 		    }
10249 		}
10250 
10251 	      /* Check that LHS does not have uses outside of STORE_BB.  */
10252 	      res = true;
10253 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10254 		{
10255 		  gimple *use_stmt;
10256 		  use_stmt = USE_STMT (use_p);
10257 		  if (is_gimple_debug (use_stmt))
10258 		    continue;
10259 		  if (gimple_bb (use_stmt) != store_bb)
10260 		    {
10261 		      res = false;
10262 		      break;
10263 		    }
10264 		}
10265 	      if (!res)
10266 		break;
10267 
10268 	      if (gimple_vuse (stmt1)
10269 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
10270 		break;
10271 
10272 	      /* Can move STMT1 to STORE_BB.  */
10273 	      if (dump_enabled_p ())
10274 		dump_printf_loc (MSG_NOTE, vect_location,
10275 				 "Move stmt to created bb\n%G", stmt1);
10276 	      gsi_move_before (&gsi_from, &gsi_to);
10277 	      /* Shift GSI_TO for further insertion.  */
10278 	      gsi_prev (&gsi_to);
10279 	    }
10280 	  /* Put other masked stores with the same mask to STORE_BB.  */
10281 	  if (worklist.is_empty ()
10282 	      || gimple_call_arg (worklist.last (), 2) != mask
10283 	      || worklist.last () != stmt1)
10284 	    break;
10285 	  last = worklist.pop ();
10286 	}
10287       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10288     }
10289 }
10290 
10291 /* Decide whether it is possible to use a zero-based induction variable
10292    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10293    the value that the induction variable must be able to hold in order
10294    to ensure that the rgroups eventually have no active vector elements.
10295    Return -1 otherwise.  */
10296 
10297 widest_int
10298 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10299 {
10300   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10301   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10302   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10303 
10304   /* Calculate the value that the induction variable must be able
10305      to hit in order to ensure that we end the loop with an all-false mask.
10306      This involves adding the maximum number of inactive trailing scalar
10307      iterations.  */
10308   widest_int iv_limit = -1;
10309   if (max_loop_iterations (loop, &iv_limit))
10310     {
10311       if (niters_skip)
10312 	{
10313 	  /* Add the maximum number of skipped iterations to the
10314 	     maximum iteration count.  */
10315 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
10316 	    iv_limit += wi::to_widest (niters_skip);
10317 	  else
10318 	    iv_limit += max_vf - 1;
10319 	}
10320       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10321 	/* Make a conservatively-correct assumption.  */
10322 	iv_limit += max_vf - 1;
10323 
10324       /* IV_LIMIT is the maximum number of latch iterations, which is also
10325 	 the maximum in-range IV value.  Round this value down to the previous
10326 	 vector alignment boundary and then add an extra full iteration.  */
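      /* Editorial illustration of the rounding below (values made up):
	 with max_vf == 16, known_alignment (vf) == 16 and iv_limit == 100,
	 the result is (100 & -16) + 16 == 112.  */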
10327       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10328       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10329     }
10330   return iv_limit;
10331 }
10332 
10333 /* For the given rgroup_controls RGC, check whether an induction variable
10334    would ever hit a value that produces a set of all-false masks or zero
10335    lengths before wrapping around.  Return true if it's possible to wrap
10336    around before hitting the desirable value, otherwise return false.  */
10337 
10338 bool
10339 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10340 {
10341   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10342 
10343   if (iv_limit == -1)
10344     return true;
10345 
10346   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10347   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10348   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10349 
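  /* Editorial illustration (values made up): with a 32-bit compare type,
     iv_limit == 2^30 and nitems == 8, the product needs 34 bits of
     precision, so the IV could wrap before all controls become inactive
     and we return true.  */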
10350   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10351     return true;
10352 
10353   return false;
10354 }
10355