xref: /netbsd-src/external/gpl3/gcc/dist/gcc/tree-vect-loop.cc (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1 /* Loop Vectorization
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 
59 /* Loop Vectorization Pass.
60 
61    This pass tries to vectorize loops.
62 
63    For example, the vectorizer transforms the following simple loop:
64 
65         short a[N]; short b[N]; short c[N]; int i;
66 
67         for (i=0; i<N; i++){
68           a[i] = b[i] + c[i];
69         }
70 
71    as if it was manually vectorized by rewriting the source code into:
72 
73         typedef int __attribute__((mode(V8HI))) v8hi;
74         short a[N];  short b[N]; short c[N];   int i;
75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76         v8hi va, vb, vc;
77 
78         for (i=0; i<N/8; i++){
79           vb = pb[i];
80           vc = pc[i];
81           va = vb + vc;
82           pa[i] = va;
83         }
84 
85         The main entry to this pass is vectorize_loops(), in which
86    the vectorizer applies a set of analyses on a given set of loops,
87    followed by the actual vectorization transformation for the loops that
88    had successfully passed the analysis phase.
89         Throughout this pass we make a distinction between two types of
90    data: scalars (which are represented by SSA_NAMES), and memory references
91    ("data-refs").  These two types of data require different handling both
92    during analysis and transformation. The types of data-refs that the
93    vectorizer currently supports are ARRAY_REFS which base is an array DECL
94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95    accesses are required to have a simple (consecutive) access pattern.
96 
97    Analysis phase:
98    ===============
99         The driver for the analysis phase is vect_analyze_loop().
100    It applies a set of analyses, some of which rely on the scalar evolution
101    analyzer (scev) developed by Sebastian Pop.
102 
103         During the analysis phase the vectorizer records some information
104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105    loop, as well as general information about the loop as a whole, which is
106    recorded in a "loop_vec_info" struct attached to each loop.
107 
108    Transformation phase:
109    =====================
110         The loop transformation phase scans all the stmts in the loop, and
111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112    the loop that needs to be vectorized.  It inserts the vector code sequence
113    just before the scalar stmt S, and records a pointer to the vector code
114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115    attached to S).  This pointer will be used for the vectorization of following
116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117    otherwise, we rely on dead code elimination for removing it.
118 
119         For example, say stmt S1 was vectorized into stmt VS1:
120 
121    VS1: vb = px[i];
122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123    S2:  a = b;
124 
125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
128    resulting sequence would be:
129 
130    VS1: vb = px[i];
131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132    VS2: va = vb;
133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 
135         Operands that are not SSA_NAMEs, are data-refs that appear in
136    load/store operations (like 'x[i]' in S1), and are handled differently.
137 
138    Target modeling:
139    =================
140         Currently the only target specific information that is used is the
141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142    Targets that can support different sizes of vectors, for now will need
143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
144    flexibility will be added in the future.
145 
146         Since we only vectorize operations which vector form can be
147    expressed using existing tree codes, to verify that an operation is
148    supported, the vectorizer checks the relevant optab at the relevant
149    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
150    the value found is CODE_FOR_nothing, then there's no target support, and
151    we can't vectorize the stmt.
152 
153    For additional information on this project see:
154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156 
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 						unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 					       bool *, bool *);
161 
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164    may already be set for general statements (not just data refs).  */
165 
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 			      bool vectype_maybe_set_p,
169 			      poly_uint64 *vf)
170 {
171   gimple *stmt = stmt_info->stmt;
172 
173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174        && !STMT_VINFO_LIVE_P (stmt_info))
175       || gimple_clobber_p (stmt))
176     {
177       if (dump_enabled_p ())
178 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179       return opt_result::success ();
180     }
181 
182   tree stmt_vectype, nunits_vectype;
183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 						   &stmt_vectype,
185 						   &nunits_vectype);
186   if (!res)
187     return res;
188 
189   if (stmt_vectype)
190     {
191       if (STMT_VINFO_VECTYPE (stmt_info))
192 	/* The only case when a vectype had been already set is for stmts
193 	   that contain a data ref, or for "pattern-stmts" (stmts generated
194 	   by the vectorizer to represent/replace a certain idiom).  */
195 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 		     || vectype_maybe_set_p)
197 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198       else
199 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200     }
201 
202   if (nunits_vectype)
203     vect_update_max_nunits (vf, nunits_vectype);
204 
205   return opt_result::success ();
206 }
207 
208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
209    types of STMT_INFO and all attached pattern statements and update
210    the vectorization factor VF accordingly.  Return true on success
211    or false if something prevented vectorization.  */
212 
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 			    stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217   if (dump_enabled_p ())
218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 		     stmt_info->stmt);
220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221   if (!res)
222     return res;
223 
224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225       && STMT_VINFO_RELATED_STMT (stmt_info))
226     {
227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229 
230       /* If a pattern statement has def stmts, analyze them too.  */
231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 	   !gsi_end_p (si); gsi_next (&si))
233 	{
234 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 	  if (dump_enabled_p ())
236 	    dump_printf_loc (MSG_NOTE, vect_location,
237 			     "==> examining pattern def stmt: %G",
238 			     def_stmt_info->stmt);
239 	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 	  if (!res)
241 	    return res;
242 	}
243 
244       if (dump_enabled_p ())
245 	dump_printf_loc (MSG_NOTE, vect_location,
246 			 "==> examining pattern statement: %G",
247 			 stmt_info->stmt);
248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249       if (!res)
250 	return res;
251     }
252 
253   return opt_result::success ();
254 }
255 
256 /* Function vect_determine_vectorization_factor
257 
258    Determine the vectorization factor (VF).  VF is the number of data elements
259    that are operated upon in parallel in a single iteration of the vectorized
260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262    elements can fit in a single vector register.
263 
264    We currently support vectorization of loops in which all types operated upon
265    are of the same size.  Therefore this function currently sets VF according to
266    the size of the types operated upon, and fails if there are multiple sizes
267    in the loop.
268 
269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
270    original loop:
271         for (i=0; i<N; i++){
272           a[i] = b[i] + c[i];
273         }
274 
275    vectorized loop:
276         for (i=0; i<N; i+=VF){
277           a[i:VF] = b[i:VF] + c[i:VF];
278         }
279 */
280 
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286   unsigned nbbs = loop->num_nodes;
287   poly_uint64 vectorization_factor = 1;
288   tree scalar_type = NULL_TREE;
289   gphi *phi;
290   tree vectype;
291   stmt_vec_info stmt_info;
292   unsigned i;
293 
294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295 
296   for (i = 0; i < nbbs; i++)
297     {
298       basic_block bb = bbs[i];
299 
300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 	   gsi_next (&si))
302 	{
303 	  phi = si.phi ();
304 	  stmt_info = loop_vinfo->lookup_stmt (phi);
305 	  if (dump_enabled_p ())
306 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 			     phi);
308 
309 	  gcc_assert (stmt_info);
310 
311 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
312 	      || STMT_VINFO_LIVE_P (stmt_info))
313             {
314 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
316 
317 	      if (dump_enabled_p ())
318 		dump_printf_loc (MSG_NOTE, vect_location,
319 				 "get vectype for scalar type:  %T\n",
320 				 scalar_type);
321 
322 	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 	      if (!vectype)
324 		return opt_result::failure_at (phi,
325 					       "not vectorized: unsupported "
326 					       "data-type %T\n",
327 					       scalar_type);
328 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
329 
330 	      if (dump_enabled_p ())
331 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 				 vectype);
333 
334 	      if (dump_enabled_p ())
335 		{
336 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 		  dump_printf (MSG_NOTE, "\n");
339 		}
340 
341 	      vect_update_max_nunits (&vectorization_factor, vectype);
342 	    }
343 	}
344 
345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 	   gsi_next (&si))
347 	{
348 	  if (is_gimple_debug (gsi_stmt (si)))
349 	    continue;
350 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 	  opt_result res
352 	    = vect_determine_vf_for_stmt (loop_vinfo,
353 					  stmt_info, &vectorization_factor);
354 	  if (!res)
355 	    return res;
356         }
357     }
358 
359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
360   if (dump_enabled_p ())
361     {
362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363       dump_dec (MSG_NOTE, vectorization_factor);
364       dump_printf (MSG_NOTE, "\n");
365     }
366 
367   if (known_le (vectorization_factor, 1U))
368     return opt_result::failure_at (vect_location,
369 				   "not vectorized: unsupported data-type\n");
370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371   return opt_result::success ();
372 }
373 
374 
375 /* Function vect_is_simple_iv_evolution.
376 
377    FORNOW: A simple evolution of an induction variables in the loop is
378    considered a polynomial evolution.  */
379 
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382                              tree * step)
383 {
384   tree init_expr;
385   tree step_expr;
386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387   basic_block bb;
388 
389   /* When there is no evolution in this loop, the evolution function
390      is not "simple".  */
391   if (evolution_part == NULL_TREE)
392     return false;
393 
394   /* When the evolution is a polynomial of degree >= 2
395      the evolution function is not "simple".  */
396   if (tree_is_chrec (evolution_part))
397     return false;
398 
399   step_expr = evolution_part;
400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401 
402   if (dump_enabled_p ())
403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
404 		     step_expr, init_expr);
405 
406   *init = init_expr;
407   *step = step_expr;
408 
409   if (TREE_CODE (step_expr) != INTEGER_CST
410       && (TREE_CODE (step_expr) != SSA_NAME
411 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 		  || !flag_associative_math)))
416       && (TREE_CODE (step_expr) != REAL_CST
417 	  || !flag_associative_math))
418     {
419       if (dump_enabled_p ())
420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421                          "step unknown.\n");
422       return false;
423     }
424 
425   return true;
426 }
427 
428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
429    what we are assuming is a double reduction.  For example, given
430    a structure like this:
431 
432       outer1:
433 	x_1 = PHI <x_4(outer2), ...>;
434 	...
435 
436       inner:
437 	x_2 = PHI <x_1(outer1), ...>;
438 	...
439 	x_3 = ...;
440 	...
441 
442       outer2:
443 	x_4 = PHI <x_3(inner)>;
444 	...
445 
446    outer loop analysis would treat x_1 as a double reduction phi and
447    this function would then return true for x_2.  */
448 
449 static bool
450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
451 {
452   use_operand_p use_p;
453   ssa_op_iter op_iter;
454   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
455     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
456       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
457 	return true;
458   return false;
459 }
460 
461 /* Function vect_analyze_scalar_cycles_1.
462 
463    Examine the cross iteration def-use cycles of scalar variables
464    in LOOP.  LOOP_VINFO represents the loop that is now being
465    considered for vectorization (can be LOOP, or an outer-loop
466    enclosing LOOP).  */
467 
468 static void
469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
470 {
471   basic_block bb = loop->header;
472   tree init, step;
473   auto_vec<stmt_vec_info, 64> worklist;
474   gphi_iterator gsi;
475   bool double_reduc, reduc_chain;
476 
477   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
478 
479   /* First - identify all inductions.  Reduction detection assumes that all the
480      inductions have been identified, therefore, this order must not be
481      changed.  */
482   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
483     {
484       gphi *phi = gsi.phi ();
485       tree access_fn = NULL;
486       tree def = PHI_RESULT (phi);
487       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
488 
489       if (dump_enabled_p ())
490 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
491 
492       /* Skip virtual phi's.  The data dependences that are associated with
493          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
494       if (virtual_operand_p (def))
495 	continue;
496 
497       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
498 
499       /* Analyze the evolution function.  */
500       access_fn = analyze_scalar_evolution (loop, def);
501       if (access_fn)
502 	{
503 	  STRIP_NOPS (access_fn);
504 	  if (dump_enabled_p ())
505 	    dump_printf_loc (MSG_NOTE, vect_location,
506 			     "Access function of PHI: %T\n", access_fn);
507 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
508 	    = initial_condition_in_loop_num (access_fn, loop->num);
509 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
510 	    = evolution_part_in_loop_num (access_fn, loop->num);
511 	}
512 
513       if (!access_fn
514 	  || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
515 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
516 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
517 	      && TREE_CODE (step) != INTEGER_CST))
518 	{
519 	  worklist.safe_push (stmt_vinfo);
520 	  continue;
521 	}
522 
523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
524 		  != NULL_TREE);
525       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
526 
527       if (dump_enabled_p ())
528 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
529       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530     }
531 
532 
533   /* Second - identify all reductions and nested cycles.  */
534   while (worklist.length () > 0)
535     {
536       stmt_vec_info stmt_vinfo = worklist.pop ();
537       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
538       tree def = PHI_RESULT (phi);
539 
540       if (dump_enabled_p ())
541 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
542 
543       gcc_assert (!virtual_operand_p (def)
544 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
545 
546       stmt_vec_info reduc_stmt_info
547 	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
548 				    &reduc_chain);
549       if (reduc_stmt_info)
550         {
551 	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
552 	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
553 	  if (double_reduc)
554 	    {
555 	      if (dump_enabled_p ())
556 		dump_printf_loc (MSG_NOTE, vect_location,
557 				 "Detected double reduction.\n");
558 
559               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
560 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
561             }
562           else
563             {
564               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
565                 {
566                   if (dump_enabled_p ())
567                     dump_printf_loc (MSG_NOTE, vect_location,
568 				     "Detected vectorizable nested cycle.\n");
569 
570                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
571                 }
572               else
573                 {
574                   if (dump_enabled_p ())
575                     dump_printf_loc (MSG_NOTE, vect_location,
576 				     "Detected reduction.\n");
577 
578                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
579 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
580                   /* Store the reduction cycles for possible vectorization in
581                      loop-aware SLP if it was not detected as reduction
582 		     chain.  */
583 		  if (! reduc_chain)
584 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
585 		      (reduc_stmt_info);
586                 }
587             }
588         }
589       else
590         if (dump_enabled_p ())
591           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
592 			   "Unknown def-use cycle pattern.\n");
593     }
594 }
595 
596 
597 /* Function vect_analyze_scalar_cycles.
598 
599    Examine the cross iteration def-use cycles of scalar variables, by
600    analyzing the loop-header PHIs of scalar variables.  Classify each
601    cycle as one of the following: invariant, induction, reduction, unknown.
602    We do that for the loop represented by LOOP_VINFO, and also to its
603    inner-loop, if exists.
604    Examples for scalar cycles:
605 
606    Example1: reduction:
607 
608               loop1:
609               for (i=0; i<N; i++)
610                  sum += a[i];
611 
612    Example2: induction:
613 
614               loop2:
615               for (i=0; i<N; i++)
616                  a[i] = i;  */
617 
618 static void
619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
620 {
621   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622 
623   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
624 
625   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
626      Reductions in such inner-loop therefore have different properties than
627      the reductions in the nest that gets vectorized:
628      1. When vectorized, they are executed in the same order as in the original
629         scalar loop, so we can't change the order of computation when
630         vectorizing them.
631      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
632         current checks are too strict.  */
633 
634   if (loop->inner)
635     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 }
637 
638 /* Transfer group and reduction information from STMT_INFO to its
639    pattern stmt.  */
640 
641 static void
642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
643 {
644   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
645   stmt_vec_info stmtp;
646   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
647 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
648   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649   do
650     {
651       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
652       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
653 			   == STMT_VINFO_DEF_TYPE (stmt_info));
654       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
655       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
656       if (stmt_info)
657 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
658 	  = STMT_VINFO_RELATED_STMT (stmt_info);
659     }
660   while (stmt_info);
661 }
662 
663 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
664 
665 static void
666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
667 {
668   stmt_vec_info first;
669   unsigned i;
670 
671   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
672     {
673       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
674       while (next)
675 	{
676 	  if ((STMT_VINFO_IN_PATTERN_P (next)
677 	       != STMT_VINFO_IN_PATTERN_P (first))
678 	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
679 	    break;
680 	  next = REDUC_GROUP_NEXT_ELEMENT (next);
681 	}
682       /* If all reduction chain members are well-formed patterns adjust
683 	 the group to group the pattern stmts instead.  */
684       if (! next
685 	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
686 	{
687 	  if (STMT_VINFO_IN_PATTERN_P (first))
688 	    {
689 	      vect_fixup_reduc_chain (first);
690 	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
691 		= STMT_VINFO_RELATED_STMT (first);
692 	    }
693 	}
694       /* If not all stmt in the chain are patterns or if we failed
695 	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
696 	 it as regular reduction instead.  */
697       else
698 	{
699 	  stmt_vec_info vinfo = first;
700 	  stmt_vec_info last = NULL;
701 	  while (vinfo)
702 	    {
703 	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
704 	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
705 	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
706 	      last = vinfo;
707 	      vinfo = next;
708 	    }
709 	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
710 	    = vect_internal_def;
711 	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
712 	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
713 	  --i;
714 	}
715     }
716 }
717 
718 /* Function vect_get_loop_niters.
719 
720    Determine how many iterations the loop is executed and place it
721    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
722    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
723    niter information holds in ASSUMPTIONS.
724 
725    Return the loop exit condition.  */
726 
727 
728 static gcond *
729 vect_get_loop_niters (class loop *loop, tree *assumptions,
730 		      tree *number_of_iterations, tree *number_of_iterationsm1)
731 {
732   edge exit = single_exit (loop);
733   class tree_niter_desc niter_desc;
734   tree niter_assumptions, niter, may_be_zero;
735   gcond *cond = get_loop_exit_condition (loop);
736 
737   *assumptions = boolean_true_node;
738   *number_of_iterationsm1 = chrec_dont_know;
739   *number_of_iterations = chrec_dont_know;
740   DUMP_VECT_SCOPE ("get_loop_niters");
741 
742   if (!exit)
743     return cond;
744 
745   may_be_zero = NULL_TREE;
746   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
747       || chrec_contains_undetermined (niter_desc.niter))
748     return cond;
749 
750   niter_assumptions = niter_desc.assumptions;
751   may_be_zero = niter_desc.may_be_zero;
752   niter = niter_desc.niter;
753 
754   if (may_be_zero && integer_zerop (may_be_zero))
755     may_be_zero = NULL_TREE;
756 
757   if (may_be_zero)
758     {
759       if (COMPARISON_CLASS_P (may_be_zero))
760 	{
761 	  /* Try to combine may_be_zero with assumptions, this can simplify
762 	     computation of niter expression.  */
763 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
764 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
765 					     niter_assumptions,
766 					     fold_build1 (TRUTH_NOT_EXPR,
767 							  boolean_type_node,
768 							  may_be_zero));
769 	  else
770 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
771 				 build_int_cst (TREE_TYPE (niter), 0),
772 				 rewrite_to_non_trapping_overflow (niter));
773 
774 	  may_be_zero = NULL_TREE;
775 	}
776       else if (integer_nonzerop (may_be_zero))
777 	{
778 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
779 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
780 	  return cond;
781 	}
782       else
783 	return cond;
784     }
785 
786   *assumptions = niter_assumptions;
787   *number_of_iterationsm1 = niter;
788 
789   /* We want the number of loop header executions which is the number
790      of latch executions plus one.
791      ???  For UINT_MAX latch executions this number overflows to zero
792      for loops like do { n++; } while (n != 0);  */
793   if (niter && !chrec_contains_undetermined (niter))
794     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
795 			  build_int_cst (TREE_TYPE (niter), 1));
796   *number_of_iterations = niter;
797 
798   return cond;
799 }
800 
801 /* Function bb_in_loop_p
802 
803    Used as predicate for dfs order traversal of the loop bbs.  */
804 
805 static bool
806 bb_in_loop_p (const_basic_block bb, const void *data)
807 {
808   const class loop *const loop = (const class loop *)data;
809   if (flow_bb_inside_loop_p (loop, bb))
810     return true;
811   return false;
812 }
813 
814 
815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
816    stmt_vec_info structs for all the stmts in LOOP_IN.  */
817 
818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
819   : vec_info (vec_info::loop, shared),
820     loop (loop_in),
821     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
822     num_itersm1 (NULL_TREE),
823     num_iters (NULL_TREE),
824     num_iters_unchanged (NULL_TREE),
825     num_iters_assumptions (NULL_TREE),
826     vector_costs (nullptr),
827     scalar_costs (nullptr),
828     th (0),
829     versioning_threshold (0),
830     vectorization_factor (0),
831     main_loop_edge (nullptr),
832     skip_main_loop_edge (nullptr),
833     skip_this_loop_edge (nullptr),
834     reusable_accumulators (),
835     suggested_unroll_factor (1),
836     max_vectorization_factor (0),
837     mask_skip_niters (NULL_TREE),
838     rgroup_compare_type (NULL_TREE),
839     simd_if_cond (NULL_TREE),
840     unaligned_dr (NULL),
841     peeling_for_alignment (0),
842     ptr_mask (0),
843     ivexpr_map (NULL),
844     scan_map (NULL),
845     slp_unrolling_factor (1),
846     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
847     vectorizable (false),
848     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
849     using_partial_vectors_p (false),
850     epil_using_partial_vectors_p (false),
851     partial_load_store_bias (0),
852     peeling_for_gaps (false),
853     peeling_for_niter (false),
854     no_data_dependencies (false),
855     has_mask_store (false),
856     scalar_loop_scaling (profile_probability::uninitialized ()),
857     scalar_loop (NULL),
858     orig_loop_info (NULL)
859 {
860   /* CHECKME: We want to visit all BBs before their successors (except for
861      latch blocks, for which this assertion wouldn't hold).  In the simple
862      case of the loop forms we allow, a dfs order of the BBs would the same
863      as reversed postorder traversal, so we are safe.  */
864 
865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 					  bbs, loop->num_nodes, loop);
867   gcc_assert (nbbs == loop->num_nodes);
868 
869   for (unsigned int i = 0; i < nbbs; i++)
870     {
871       basic_block bb = bbs[i];
872       gimple_stmt_iterator si;
873 
874       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875 	{
876 	  gimple *phi = gsi_stmt (si);
877 	  gimple_set_uid (phi, 0);
878 	  add_stmt (phi);
879 	}
880 
881       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
882 	{
883 	  gimple *stmt = gsi_stmt (si);
884 	  gimple_set_uid (stmt, 0);
885 	  if (is_gimple_debug (stmt))
886 	    continue;
887 	  add_stmt (stmt);
888 	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
889 	     third argument is the #pragma omp simd if (x) condition, when 0,
890 	     loop shouldn't be vectorized, when non-zero constant, it should
891 	     be vectorized normally, otherwise versioned with vectorized loop
892 	     done if the condition is non-zero at runtime.  */
893 	  if (loop_in->simduid
894 	      && is_gimple_call (stmt)
895 	      && gimple_call_internal_p (stmt)
896 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
897 	      && gimple_call_num_args (stmt) >= 3
898 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
899 	      && (loop_in->simduid
900 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
901 	    {
902 	      tree arg = gimple_call_arg (stmt, 2);
903 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
904 		simd_if_cond = arg;
905 	      else
906 		gcc_assert (integer_nonzerop (arg));
907 	    }
908 	}
909     }
910 
911   epilogue_vinfos.create (6);
912 }
913 
914 /* Free all levels of rgroup CONTROLS.  */
915 
916 void
917 release_vec_loop_controls (vec<rgroup_controls> *controls)
918 {
919   rgroup_controls *rgc;
920   unsigned int i;
921   FOR_EACH_VEC_ELT (*controls, i, rgc)
922     rgc->controls.release ();
923   controls->release ();
924 }
925 
926 /* Free all memory used by the _loop_vec_info, as well as all the
927    stmt_vec_info structs of all the stmts in the loop.  */
928 
929 _loop_vec_info::~_loop_vec_info ()
930 {
931   free (bbs);
932 
933   release_vec_loop_controls (&masks);
934   release_vec_loop_controls (&lens);
935   delete ivexpr_map;
936   delete scan_map;
937   epilogue_vinfos.release ();
938   delete scalar_costs;
939   delete vector_costs;
940 
941   /* When we release an epiloge vinfo that we do not intend to use
942      avoid clearing AUX of the main loop which should continue to
943      point to the main loop vinfo since otherwise we'll leak that.  */
944   if (loop->aux == this)
945     loop->aux = NULL;
946 }
947 
948 /* Return an invariant or register for EXPR and emit necessary
949    computations in the LOOP_VINFO loop preheader.  */
950 
951 tree
952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
953 {
954   if (is_gimple_reg (expr)
955       || is_gimple_min_invariant (expr))
956     return expr;
957 
958   if (! loop_vinfo->ivexpr_map)
959     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
960   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
961   if (! cached)
962     {
963       gimple_seq stmts = NULL;
964       cached = force_gimple_operand (unshare_expr (expr),
965 				     &stmts, true, NULL_TREE);
966       if (stmts)
967 	{
968 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
969 	  gsi_insert_seq_on_edge_immediate (e, stmts);
970 	}
971     }
972   return cached;
973 }
974 
975 /* Return true if we can use CMP_TYPE as the comparison type to produce
976    all masks required to mask LOOP_VINFO.  */
977 
978 static bool
979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
980 {
981   rgroup_controls *rgm;
982   unsigned int i;
983   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
984     if (rgm->type != NULL_TREE
985 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
986 					    cmp_type, rgm->type,
987 					    OPTIMIZE_FOR_SPEED))
988       return false;
989   return true;
990 }
991 
992 /* Calculate the maximum number of scalars per iteration for every
993    rgroup in LOOP_VINFO.  */
994 
995 static unsigned int
996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
997 {
998   unsigned int res = 1;
999   unsigned int i;
1000   rgroup_controls *rgm;
1001   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002     res = MAX (res, rgm->max_nscalars_per_iter);
1003   return res;
1004 }
1005 
1006 /* Calculate the minimum precision necessary to represent:
1007 
1008       MAX_NITERS * FACTOR
1009 
1010    as an unsigned integer, where MAX_NITERS is the maximum number of
1011    loop header iterations for the original scalar form of LOOP_VINFO.  */
1012 
1013 static unsigned
1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1015 {
1016   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 
1018   /* Get the maximum number of iterations that is representable
1019      in the counter type.  */
1020   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022 
1023   /* Get a more refined estimate for the number of iterations.  */
1024   widest_int max_back_edges;
1025   if (max_loop_iterations (loop, &max_back_edges))
1026     max_ni = wi::smin (max_ni, max_back_edges + 1);
1027 
1028   /* Work out how many bits we need to represent the limit.  */
1029   return wi::min_precision (max_ni * factor, UNSIGNED);
1030 }
1031 
1032 /* True if the loop needs peeling or partial vectors when vectorized.  */
1033 
1034 static bool
1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1036 {
1037   unsigned HOST_WIDE_INT const_vf;
1038   HOST_WIDE_INT max_niter
1039     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1040 
1041   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044 					  (loop_vinfo));
1045 
1046   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1048     {
1049       /* Work out the (constant) number of iterations that need to be
1050 	 peeled for reasons other than niters.  */
1051       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053 	peel_niter += 1;
1054       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056 	return true;
1057     }
1058   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059       /* ??? When peeling for gaps but not alignment, we could
1060 	 try to check whether the (variable) niters is known to be
1061 	 VF * N + 1.  That's something of a niche case though.  */
1062       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065 	   < (unsigned) exact_log2 (const_vf))
1066 	  /* In case of versioning, check if the maximum number of
1067 	     iterations is greater than th.  If they are identical,
1068 	     the epilogue is unnecessary.  */
1069 	  && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070 	      || ((unsigned HOST_WIDE_INT) max_niter
1071 		  > (th / const_vf) * const_vf))))
1072     return true;
1073 
1074   return false;
1075 }
1076 
1077 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1078    whether we can actually generate the masks required.  Return true if so,
1079    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1080 
1081 static bool
1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1083 {
1084   unsigned int min_ni_width;
1085   unsigned int max_nscalars_per_iter
1086     = vect_get_max_nscalars_per_iter (loop_vinfo);
1087 
1088   /* Use a normal loop if there are no statements that need masking.
1089      This only happens in rare degenerate cases: it means that the loop
1090      has no loads, no stores, and no live-out values.  */
1091   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092     return false;
1093 
1094   /* Work out how many bits we need to represent the limit.  */
1095   min_ni_width
1096     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1097 
1098   /* Find a scalar mode for which WHILE_ULT is supported.  */
1099   opt_scalar_int_mode cmp_mode_iter;
1100   tree cmp_type = NULL_TREE;
1101   tree iv_type = NULL_TREE;
1102   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103   unsigned int iv_precision = UINT_MAX;
1104 
1105   if (iv_limit != -1)
1106     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107 				      UNSIGNED);
1108 
1109   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1110     {
1111       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112       if (cmp_bits >= min_ni_width
1113 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1114 	{
1115 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116 	  if (this_type
1117 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1118 	    {
1119 	      /* Although we could stop as soon as we find a valid mode,
1120 		 there are at least two reasons why that's not always the
1121 		 best choice:
1122 
1123 		 - An IV that's Pmode or wider is more likely to be reusable
1124 		   in address calculations than an IV that's narrower than
1125 		   Pmode.
1126 
1127 		 - Doing the comparison in IV_PRECISION or wider allows
1128 		   a natural 0-based IV, whereas using a narrower comparison
1129 		   type requires mitigations against wrap-around.
1130 
1131 		 Conversely, if the IV limit is variable, doing the comparison
1132 		 in a wider type than the original type can introduce
1133 		 unnecessary extensions, so picking the widest valid mode
1134 		 is not always a good choice either.
1135 
1136 		 Here we prefer the first IV type that's Pmode or wider,
1137 		 and the first comparison type that's IV_PRECISION or wider.
1138 		 (The comparison type must be no wider than the IV type,
1139 		 to avoid extensions in the vector loop.)
1140 
1141 		 ??? We might want to try continuing beyond Pmode for ILP32
1142 		 targets if CMP_BITS < IV_PRECISION.  */
1143 	      iv_type = this_type;
1144 	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145 		cmp_type = this_type;
1146 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147 		break;
1148 	    }
1149 	}
1150     }
1151 
1152   if (!cmp_type)
1153     return false;
1154 
1155   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157   return true;
1158 }
1159 
1160 /* Check whether we can use vector access with length based on precison
1161    comparison.  So far, to keep it simple, we only allow the case that the
1162    precision of the target supported length is larger than the precision
1163    required by loop niters.  */
1164 
1165 static bool
1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1167 {
1168   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169     return false;
1170 
1171   machine_mode len_load_mode = get_len_load_store_mode
1172     (loop_vinfo->vector_mode, true).require ();
1173   machine_mode len_store_mode = get_len_load_store_mode
1174     (loop_vinfo->vector_mode, false).require ();
1175 
1176   signed char partial_load_bias = internal_len_load_store_bias
1177     (IFN_LEN_LOAD, len_load_mode);
1178 
1179   signed char partial_store_bias = internal_len_load_store_bias
1180     (IFN_LEN_STORE, len_store_mode);
1181 
1182   gcc_assert (partial_load_bias == partial_store_bias);
1183 
1184   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185     return false;
1186 
1187   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188      len_loads with a length of zero.  In order to avoid that we prohibit
1189      more than one loop length here.  */
1190   if (partial_load_bias == -1
1191       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192     return false;
1193 
1194   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1195 
1196   unsigned int max_nitems_per_iter = 1;
1197   unsigned int i;
1198   rgroup_controls *rgl;
1199   /* Find the maximum number of items per iteration for every rgroup.  */
1200   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1201     {
1202       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1204     }
1205 
1206   /* Work out how many bits we need to represent the length limit.  */
1207   unsigned int min_ni_prec
1208     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1209 
1210   /* Now use the maximum of below precisions for one suitable IV type:
1211      - the IV's natural precision
1212      - the precision needed to hold: the maximum number of scalar
1213        iterations multiplied by the scale factor (min_ni_prec above)
1214      - the Pmode precision
1215 
1216      If min_ni_prec is less than the precision of the current niters,
1217      we perfer to still use the niters type.  Prefer to use Pmode and
1218      wider IV to avoid narrow conversions.  */
1219 
1220   unsigned int ni_prec
1221     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222   min_ni_prec = MAX (min_ni_prec, ni_prec);
1223   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1224 
1225   tree iv_type = NULL_TREE;
1226   opt_scalar_int_mode tmode_iter;
1227   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1228     {
1229       scalar_mode tmode = tmode_iter.require ();
1230       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1231 
1232       /* ??? Do we really want to construct one IV whose precision exceeds
1233 	 BITS_PER_WORD?  */
1234       if (tbits > BITS_PER_WORD)
1235 	break;
1236 
1237       /* Find the first available standard integral type.  */
1238       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1239 	{
1240 	  iv_type = build_nonstandard_integer_type (tbits, true);
1241 	  break;
1242 	}
1243     }
1244 
1245   if (!iv_type)
1246     {
1247       if (dump_enabled_p ())
1248 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 			 "can't vectorize with length-based partial vectors"
1250 			 " because there is no suitable iv type.\n");
1251       return false;
1252     }
1253 
1254   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1256 
1257   return true;
1258 }
1259 
1260 /* Calculate the cost of one scalar iteration of the loop.  */
1261 static void
1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1263 {
1264   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266   int nbbs = loop->num_nodes, factor;
1267   int innerloop_iters, i;
1268 
1269   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1270 
1271   /* Gather costs for statements in the scalar loop.  */
1272 
1273   /* FORNOW.  */
1274   innerloop_iters = 1;
1275   if (loop->inner)
1276     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1277 
1278   for (i = 0; i < nbbs; i++)
1279     {
1280       gimple_stmt_iterator si;
1281       basic_block bb = bbs[i];
1282 
1283       if (bb->loop_father == loop->inner)
1284         factor = innerloop_iters;
1285       else
1286         factor = 1;
1287 
1288       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1289         {
1290 	  gimple *stmt = gsi_stmt (si);
1291 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1292 
1293           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294             continue;
1295 
1296           /* Skip stmts that are not vectorized inside the loop.  */
1297 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299               && (!STMT_VINFO_LIVE_P (vstmt_info)
1300                   || !VECTORIZABLE_CYCLE_DEF
1301 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1302             continue;
1303 
1304 	  vect_cost_for_stmt kind;
1305           if (STMT_VINFO_DATA_REF (stmt_info))
1306             {
1307               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308                kind = scalar_load;
1309              else
1310                kind = scalar_store;
1311             }
1312 	  else if (vect_nop_conversion_p (stmt_info))
1313 	    continue;
1314 	  else
1315             kind = scalar_stmt;
1316 
1317 	  /* We are using vect_prologue here to avoid scaling twice
1318 	     by the inner loop factor.  */
1319 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320 			    factor, kind, stmt_info, 0, vect_prologue);
1321         }
1322     }
1323 
1324   /* Now accumulate cost.  */
1325   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326   add_stmt_costs (loop_vinfo->scalar_costs,
1327 		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328   loop_vinfo->scalar_costs->finish_cost (nullptr);
1329 }
1330 
1331 
1332 /* Function vect_analyze_loop_form.
1333 
1334    Verify that certain CFG restrictions hold, including:
1335    - the loop has a pre-header
1336    - the loop has a single entry and exit
1337    - the loop exit condition is simple enough
1338    - the number of iterations can be analyzed, i.e, a countable loop.  The
1339      niter could be analyzed under some assumptions.  */
1340 
1341 opt_result
1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1343 {
1344   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1345 
1346   /* Different restrictions apply when we are considering an inner-most loop,
1347      vs. an outer (nested) loop.
1348      (FORNOW. May want to relax some of these restrictions in the future).  */
1349 
1350   info->inner_loop_cond = NULL;
1351   if (!loop->inner)
1352     {
1353       /* Inner-most loop.  We currently require that the number of BBs is
1354 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1355 	 look like this:
1356 
1357                         (pre-header)
1358                            |
1359                           header <--------+
1360                            | |            |
1361                            | +--> latch --+
1362                            |
1363                         (exit-bb)  */
1364 
1365       if (loop->num_nodes != 2)
1366 	return opt_result::failure_at (vect_location,
1367 				       "not vectorized:"
1368 				       " control flow in loop.\n");
1369 
1370       if (empty_block_p (loop->header))
1371 	return opt_result::failure_at (vect_location,
1372 				       "not vectorized: empty loop.\n");
1373     }
1374   else
1375     {
1376       class loop *innerloop = loop->inner;
1377       edge entryedge;
1378 
1379       /* Nested loop. We currently require that the loop is doubly-nested,
1380 	 contains a single inner loop, and the number of BBs is exactly 5.
1381 	 Vectorizable outer-loops look like this:
1382 
1383 			(pre-header)
1384 			   |
1385 			  header <---+
1386 			   |         |
1387 		          inner-loop |
1388 			   |         |
1389 			  tail ------+
1390 			   |
1391 		        (exit-bb)
1392 
1393 	 The inner-loop has the properties expected of inner-most loops
1394 	 as described above.  */
1395 
1396       if ((loop->inner)->inner || (loop->inner)->next)
1397 	return opt_result::failure_at (vect_location,
1398 				       "not vectorized:"
1399 				       " multiple nested loops.\n");
1400 
1401       if (loop->num_nodes != 5)
1402 	return opt_result::failure_at (vect_location,
1403 				       "not vectorized:"
1404 				       " control flow in loop.\n");
1405 
1406       entryedge = loop_preheader_edge (innerloop);
1407       if (entryedge->src != loop->header
1408 	  || !single_exit (innerloop)
1409 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410 	return opt_result::failure_at (vect_location,
1411 				       "not vectorized:"
1412 				       " unsupported outerloop form.\n");
1413 
1414       /* Analyze the inner-loop.  */
1415       vect_loop_form_info inner;
1416       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417       if (!res)
1418 	{
1419 	  if (dump_enabled_p ())
1420 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 			     "not vectorized: Bad inner loop.\n");
1422 	  return res;
1423 	}
1424 
1425       /* Don't support analyzing niter under assumptions for inner
1426 	 loop.  */
1427       if (!integer_onep (inner.assumptions))
1428 	return opt_result::failure_at (vect_location,
1429 				       "not vectorized: Bad inner loop.\n");
1430 
1431       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432 	return opt_result::failure_at (vect_location,
1433 				       "not vectorized: inner-loop count not"
1434 				       " invariant.\n");
1435 
1436       if (dump_enabled_p ())
1437         dump_printf_loc (MSG_NOTE, vect_location,
1438 			 "Considering outer-loop vectorization.\n");
1439       info->inner_loop_cond = inner.loop_cond;
1440     }
1441 
1442   if (!single_exit (loop))
1443     return opt_result::failure_at (vect_location,
1444 				   "not vectorized: multiple exits.\n");
1445   if (EDGE_COUNT (loop->header->preds) != 2)
1446     return opt_result::failure_at (vect_location,
1447 				   "not vectorized:"
1448 				   " too many incoming edges.\n");
1449 
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     return opt_result::failure_at (vect_location,
1457 				   "not vectorized: latch block not empty.\n");
1458 
1459   /* Make sure the exit is not abnormal.  */
1460   edge e = single_exit (loop);
1461   if (e->flags & EDGE_ABNORMAL)
1462     return opt_result::failure_at (vect_location,
1463 				   "not vectorized:"
1464 				   " abnormal loop exit edge.\n");
1465 
1466   info->loop_cond
1467     = vect_get_loop_niters (loop, &info->assumptions,
1468 			    &info->number_of_iterations,
1469 			    &info->number_of_iterationsm1);
1470   if (!info->loop_cond)
1471     return opt_result::failure_at
1472       (vect_location,
1473        "not vectorized: complicated exit condition.\n");
1474 
1475   if (integer_zerop (info->assumptions)
1476       || !info->number_of_iterations
1477       || chrec_contains_undetermined (info->number_of_iterations))
1478     return opt_result::failure_at
1479       (info->loop_cond,
1480        "not vectorized: number of iterations cannot be computed.\n");
1481 
1482   if (integer_zerop (info->number_of_iterations))
1483     return opt_result::failure_at
1484       (info->loop_cond,
1485        "not vectorized: number of iterations = 0.\n");
1486 
1487   if (!(tree_fits_shwi_p (info->number_of_iterations)
1488 	&& tree_to_shwi (info->number_of_iterations) > 0))
1489     {
1490       if (dump_enabled_p ())
1491 	{
1492 	  dump_printf_loc (MSG_NOTE, vect_location,
1493 			   "Symbolic number of iterations is ");
1494 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495 	  dump_printf (MSG_NOTE, "\n");
1496 	}
1497     }
1498 
1499   return opt_result::success ();
1500 }
1501 
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503    vect_analyze_loop_form result.  */
1504 
1505 loop_vec_info
1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507 			const vect_loop_form_info *info,
1508 			loop_vec_info main_loop_info)
1509 {
1510   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515   /* Also record the assumptions for versioning.  */
1516   if (!integer_onep (info->assumptions) && !main_loop_info)
1517     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1518 
1519   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521   if (info->inner_loop_cond)
1522     {
1523       stmt_vec_info inner_loop_cond_info
1524 	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526       /* If we have an estimate on the number of iterations of the inner
1527 	 loop use that to limit the scale for costing, otherwise use
1528 	 --param vect-inner-loop-cost-factor literally.  */
1529       widest_int nit;
1530       if (estimated_stmt_executions (loop->inner, &nit))
1531 	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532 	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1533     }
1534 
1535   return loop_vinfo;
1536 }
1537 
1538 
1539 
1540 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1541    statements update the vectorization factor.  */
1542 
1543 static void
1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1545 {
1546   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548   int nbbs = loop->num_nodes;
1549   poly_uint64 vectorization_factor;
1550   int i;
1551 
1552   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1553 
1554   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555   gcc_assert (known_ne (vectorization_factor, 0U));
1556 
1557   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1558      vectorization factor of the loop is the unrolling factor required by
1559      the SLP instances.  If that unrolling factor is 1, we say, that we
1560      perform pure SLP on loop - cross iteration parallelism is not
1561      exploited.  */
1562   bool only_slp_in_loop = true;
1563   for (i = 0; i < nbbs; i++)
1564     {
1565       basic_block bb = bbs[i];
1566       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567 	   gsi_next (&si))
1568 	{
1569 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570 	  if (!stmt_info)
1571 	    continue;
1572 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 	      && !PURE_SLP_STMT (stmt_info))
1575 	    /* STMT needs both SLP and loop-based vectorization.  */
1576 	    only_slp_in_loop = false;
1577 	}
1578       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 	   gsi_next (&si))
1580 	{
1581 	  if (is_gimple_debug (gsi_stmt (si)))
1582 	    continue;
1583 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1585 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587 	      && !PURE_SLP_STMT (stmt_info))
1588 	    /* STMT needs both SLP and loop-based vectorization.  */
1589 	    only_slp_in_loop = false;
1590 	}
1591     }
1592 
1593   if (only_slp_in_loop)
1594     {
1595       if (dump_enabled_p ())
1596 	dump_printf_loc (MSG_NOTE, vect_location,
1597 			 "Loop contains only SLP stmts\n");
1598       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1599     }
1600   else
1601     {
1602       if (dump_enabled_p ())
1603 	dump_printf_loc (MSG_NOTE, vect_location,
1604 			 "Loop contains SLP and non-SLP stmts\n");
1605       /* Both the vectorization factor and unroll factor have the form
1606 	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607 	 so they must have a common multiple.  */
1608       vectorization_factor
1609 	= force_common_multiple (vectorization_factor,
1610 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611     }
1612 
1613   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614   if (dump_enabled_p ())
1615     {
1616       dump_printf_loc (MSG_NOTE, vect_location,
1617 		       "Updating vectorization factor to ");
1618       dump_dec (MSG_NOTE, vectorization_factor);
1619       dump_printf (MSG_NOTE, ".\n");
1620     }
1621 }
1622 
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624    the other phi in the reduction is also relevant for vectorization.
1625    This rejects cases such as:
1626 
1627       outer1:
1628 	x_1 = PHI <x_3(outer2), ...>;
1629 	...
1630 
1631       inner:
1632 	x_2 = ...;
1633 	...
1634 
1635       outer2:
1636 	x_3 = PHI <x_2(inner)>;
1637 
1638    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1639 
1640 static bool
1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1642 {
1643   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644     return false;
1645 
1646   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 }
1648 
1649 /* Function vect_analyze_loop_operations.
1650 
1651    Scan the loop stmts and make sure they are all vectorizable.  */
1652 
1653 static opt_result
1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1655 {
1656   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658   int nbbs = loop->num_nodes;
1659   int i;
1660   stmt_vec_info stmt_info;
1661   bool need_to_vectorize = false;
1662   bool ok;
1663 
1664   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1665 
1666   auto_vec<stmt_info_for_cost> cost_vec;
1667 
1668   for (i = 0; i < nbbs; i++)
1669     {
1670       basic_block bb = bbs[i];
1671 
1672       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673 	   gsi_next (&si))
1674         {
1675           gphi *phi = si.phi ();
1676           ok = true;
1677 
1678 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1679           if (dump_enabled_p ())
1680 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681 	  if (virtual_operand_p (gimple_phi_result (phi)))
1682 	    continue;
1683 
1684           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685              (i.e., a phi in the tail of the outer-loop).  */
1686           if (! is_loop_header_bb_p (bb))
1687             {
1688               /* FORNOW: we currently don't support the case that these phis
1689                  are not used in the outerloop (unless it is double reduction,
1690                  i.e., this phi is vect_reduction_def), cause this case
1691                  requires to actually do something here.  */
1692               if (STMT_VINFO_LIVE_P (stmt_info)
1693 		  && !vect_active_double_reduction_p (stmt_info))
1694 		return opt_result::failure_at (phi,
1695 					       "Unsupported loop-closed phi"
1696 					       " in outer-loop.\n");
1697 
1698               /* If PHI is used in the outer loop, we check that its operand
1699                  is defined in the inner loop.  */
1700               if (STMT_VINFO_RELEVANT_P (stmt_info))
1701                 {
1702                   tree phi_op;
1703 
1704                   if (gimple_phi_num_args (phi) != 1)
1705                     return opt_result::failure_at (phi, "unsupported phi");
1706 
1707                   phi_op = PHI_ARG_DEF (phi, 0);
1708 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709 		  if (!op_def_info)
1710 		    return opt_result::failure_at (phi, "unsupported phi\n");
1711 
1712 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713 		      && (STMT_VINFO_RELEVANT (op_def_info)
1714 			  != vect_used_in_outer_by_reduction))
1715 		    return opt_result::failure_at (phi, "unsupported phi\n");
1716 
1717 		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718 		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1719 			   == vect_double_reduction_def))
1720 		      && !vectorizable_lc_phi (loop_vinfo,
1721 					       stmt_info, NULL, NULL))
1722 		    return opt_result::failure_at (phi, "unsupported phi\n");
1723                 }
1724 
1725               continue;
1726             }
1727 
1728           gcc_assert (stmt_info);
1729 
1730           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731                || STMT_VINFO_LIVE_P (stmt_info))
1732               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733 	    /* A scalar-dependence cycle that we don't support.  */
1734 	    return opt_result::failure_at (phi,
1735 					   "not vectorized:"
1736 					   " scalar dependence cycle.\n");
1737 
1738           if (STMT_VINFO_RELEVANT_P (stmt_info))
1739             {
1740               need_to_vectorize = true;
1741               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742 		  && ! PURE_SLP_STMT (stmt_info))
1743 		ok = vectorizable_induction (loop_vinfo,
1744 					     stmt_info, NULL, NULL,
1745 					     &cost_vec);
1746 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747 			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1748 			    == vect_double_reduction_def)
1749 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750 		       && ! PURE_SLP_STMT (stmt_info))
1751 		ok = vectorizable_reduction (loop_vinfo,
1752 					     stmt_info, NULL, NULL, &cost_vec);
1753             }
1754 
1755 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1756 	  if (ok
1757 	      && STMT_VINFO_LIVE_P (stmt_info)
1758 	      && !PURE_SLP_STMT (stmt_info))
1759 	    ok = vectorizable_live_operation (loop_vinfo,
1760 					      stmt_info, NULL, NULL, NULL,
1761 					      -1, false, &cost_vec);
1762 
1763           if (!ok)
1764 	    return opt_result::failure_at (phi,
1765 					   "not vectorized: relevant phi not "
1766 					   "supported: %G",
1767 					   static_cast <gimple *> (phi));
1768         }
1769 
1770       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771 	   gsi_next (&si))
1772         {
1773 	  gimple *stmt = gsi_stmt (si);
1774 	  if (!gimple_clobber_p (stmt)
1775 	      && !is_gimple_debug (stmt))
1776 	    {
1777 	      opt_result res
1778 		= vect_analyze_stmt (loop_vinfo,
1779 				     loop_vinfo->lookup_stmt (stmt),
1780 				     &need_to_vectorize,
1781 				     NULL, NULL, &cost_vec);
1782 	      if (!res)
1783 		return res;
1784 	    }
1785         }
1786     } /* bbs */
1787 
1788   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1789 
1790   /* All operations in the loop are either irrelevant (deal with loop
1791      control, or dead), or only used outside the loop and can be moved
1792      out of the loop (e.g. invariants, inductions).  The loop can be
1793      optimized away by scalar optimizations.  We're better off not
1794      touching this loop.  */
1795   if (!need_to_vectorize)
1796     {
1797       if (dump_enabled_p ())
1798         dump_printf_loc (MSG_NOTE, vect_location,
1799 			 "All the computation can be taken out of the loop.\n");
1800       return opt_result::failure_at
1801 	(vect_location,
1802 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1803     }
1804 
1805   return opt_result::success ();
1806 }
1807 
1808 /* Return true if we know that the iteration count is smaller than the
1809    vectorization factor.  Return false if it isn't, or if we can't be sure
1810    either way.  */
1811 
1812 static bool
1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1814 {
1815   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 
1817   HOST_WIDE_INT max_niter;
1818   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820   else
1821     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1822 
1823   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824     return true;
1825 
1826   return false;
1827 }
1828 
1829 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1830    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1831    definitely no, or -1 if it's worth retrying.  */
1832 
1833 static int
1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835 			   unsigned *suggested_unroll_factor)
1836 {
1837   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1839 
1840   /* Only loops that can handle partially-populated vectors can have iteration
1841      counts less than the vectorization factor.  */
1842   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1843     {
1844       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1845 	{
1846 	  if (dump_enabled_p ())
1847 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 			     "not vectorized: iteration count smaller than "
1849 			     "vectorization factor.\n");
1850 	  return 0;
1851 	}
1852     }
1853 
1854   /* If using the "very cheap" model. reject cases in which we'd keep
1855      a copy of the scalar code (even if we might be able to vectorize it).  */
1856   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858 	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859 	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1860     {
1861       if (dump_enabled_p ())
1862 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 			 "some scalar iterations would need to be peeled\n");
1864       return 0;
1865     }
1866 
1867   int min_profitable_iters, min_profitable_estimate;
1868   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869 				      &min_profitable_estimate,
1870 				      suggested_unroll_factor);
1871 
1872   if (min_profitable_iters < 0)
1873     {
1874       if (dump_enabled_p ())
1875 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 			 "not vectorized: vectorization not profitable.\n");
1877       if (dump_enabled_p ())
1878 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879 			 "not vectorized: vector version will never be "
1880 			 "profitable.\n");
1881       return -1;
1882     }
1883 
1884   int min_scalar_loop_bound = (param_min_vect_loop_bound
1885 			       * assumed_vf);
1886 
1887   /* Use the cost model only if it is more conservative than user specified
1888      threshold.  */
1889   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890 				    min_profitable_iters);
1891 
1892   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1893 
1894   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1896     {
1897       if (dump_enabled_p ())
1898 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 			 "not vectorized: vectorization not profitable.\n");
1900       if (dump_enabled_p ())
1901 	dump_printf_loc (MSG_NOTE, vect_location,
1902 			 "not vectorized: iteration count smaller than user "
1903 			 "specified loop bound parameter or minimum profitable "
1904 			 "iterations (whichever is more conservative).\n");
1905       return 0;
1906     }
1907 
1908   /* The static profitablity threshold min_profitable_estimate includes
1909      the cost of having to check at runtime whether the scalar loop
1910      should be used instead.  If it turns out that we don't need or want
1911      such a check, the threshold we should use for the static estimate
1912      is simply the point at which the vector loop becomes more profitable
1913      than the scalar loop.  */
1914   if (min_profitable_estimate > min_profitable_iters
1915       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1919     {
1920       if (dump_enabled_p ())
1921 	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922 			 " choice between the scalar and vector loops\n");
1923       min_profitable_estimate = min_profitable_iters;
1924     }
1925 
1926   /* If the vector loop needs multiple iterations to be beneficial then
1927      things are probably too close to call, and the conservative thing
1928      would be to stick with the scalar code.  */
1929   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1931     {
1932       if (dump_enabled_p ())
1933 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 			 "one iteration of the vector loop would be"
1935 			 " more expensive than the equivalent number of"
1936 			 " iterations of the scalar loop\n");
1937       return 0;
1938     }
1939 
1940   HOST_WIDE_INT estimated_niter;
1941 
1942   /* If we are vectorizing an epilogue then we know the maximum number of
1943      scalar iterations it will cover is at least one lower than the
1944      vectorization factor of the main loop.  */
1945   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946     estimated_niter
1947       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948   else
1949     {
1950       estimated_niter = estimated_stmt_executions_int (loop);
1951       if (estimated_niter == -1)
1952 	estimated_niter = likely_max_stmt_executions_int (loop);
1953     }
1954   if (estimated_niter != -1
1955       && ((unsigned HOST_WIDE_INT) estimated_niter
1956 	  < MAX (th, (unsigned) min_profitable_estimate)))
1957     {
1958       if (dump_enabled_p ())
1959 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960 			 "not vectorized: estimated iteration count too "
1961 			 "small.\n");
1962       if (dump_enabled_p ())
1963 	dump_printf_loc (MSG_NOTE, vect_location,
1964 			 "not vectorized: estimated iteration count smaller "
1965 			 "than specified loop bound parameter or minimum "
1966 			 "profitable iterations (whichever is more "
1967 			 "conservative).\n");
1968       return -1;
1969     }
1970 
1971   return 1;
1972 }
1973 
1974 static opt_result
1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976 			   vec<data_reference_p> *datarefs,
1977 			   unsigned int *n_stmts)
1978 {
1979   *n_stmts = 0;
1980   for (unsigned i = 0; i < loop->num_nodes; i++)
1981     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982 	 !gsi_end_p (gsi); gsi_next (&gsi))
1983       {
1984 	gimple *stmt = gsi_stmt (gsi);
1985 	if (is_gimple_debug (stmt))
1986 	  continue;
1987 	++(*n_stmts);
1988 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989 							NULL, 0);
1990 	if (!res)
1991 	  {
1992 	    if (is_gimple_call (stmt) && loop->safelen)
1993 	      {
1994 		tree fndecl = gimple_call_fndecl (stmt), op;
1995 		if (fndecl != NULL_TREE)
1996 		  {
1997 		    cgraph_node *node = cgraph_node::get (fndecl);
1998 		    if (node != NULL && node->simd_clones != NULL)
1999 		      {
2000 			unsigned int j, n = gimple_call_num_args (stmt);
2001 			for (j = 0; j < n; j++)
2002 			  {
2003 			    op = gimple_call_arg (stmt, j);
2004 			    if (DECL_P (op)
2005 				|| (REFERENCE_CLASS_P (op)
2006 				    && get_base_address (op)))
2007 			      break;
2008 			  }
2009 			op = gimple_call_lhs (stmt);
2010 			/* Ignore #pragma omp declare simd functions
2011 			   if they don't have data references in the
2012 			   call stmt itself.  */
2013 			if (j == n
2014 			    && !(op
2015 				 && (DECL_P (op)
2016 				     || (REFERENCE_CLASS_P (op)
2017 					 && get_base_address (op)))))
2018 			  continue;
2019 		      }
2020 		  }
2021 	      }
2022 	    return res;
2023 	  }
2024 	/* If dependence analysis will give up due to the limit on the
2025 	   number of datarefs stop here and fail fatally.  */
2026 	if (datarefs->length ()
2027 	    > (unsigned)param_loop_max_datarefs_for_datadeps)
2028 	  return opt_result::failure_at (stmt, "exceeded param "
2029 					 "loop-max-datarefs-for-datadeps\n");
2030       }
2031   return opt_result::success ();
2032 }
2033 
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035    group.  */
2036 static void
2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2038 {
2039   unsigned int i;
2040   struct data_reference *dr;
2041 
2042   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2043 
2044   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045   FOR_EACH_VEC_ELT (datarefs, i, dr)
2046     {
2047       gcc_assert (DR_REF (dr));
2048       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2049 
2050       /* Check if the load is a part of an interleaving chain.  */
2051       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 	{
2053 	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055 	  unsigned int group_size = DR_GROUP_SIZE (first_element);
2056 
2057 	  /* Check if SLP-only groups.  */
2058 	  if (!STMT_SLP_TYPE (stmt_info)
2059 	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
2060 	    {
2061 	      /* Dissolve the group.  */
2062 	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2063 
2064 	      stmt_vec_info vinfo = first_element;
2065 	      while (vinfo)
2066 		{
2067 		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068 		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069 		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070 		  DR_GROUP_SIZE (vinfo) = 1;
2071 		  if (STMT_VINFO_STRIDED_P (first_element))
2072 		    DR_GROUP_GAP (vinfo) = 0;
2073 		  else
2074 		    DR_GROUP_GAP (vinfo) = group_size - 1;
2075 		  /* Duplicate and adjust alignment info, it needs to
2076 		     be present on each group leader, see dr_misalignment.  */
2077 		  if (vinfo != first_element)
2078 		    {
2079 		      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080 		      dr_info2->target_alignment = dr_info->target_alignment;
2081 		      int misalignment = dr_info->misalignment;
2082 		      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2083 			{
2084 			  HOST_WIDE_INT diff
2085 			    = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086 			       - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087 			  unsigned HOST_WIDE_INT align_c
2088 			    = dr_info->target_alignment.to_constant ();
2089 			  misalignment = (misalignment + diff) % align_c;
2090 			}
2091 		      dr_info2->misalignment = misalignment;
2092 		    }
2093 		  vinfo = next;
2094 		}
2095 	    }
2096 	}
2097     }
2098 }
2099 
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101    some scalar iterations still to do.  If so, decide how we should
2102    handle those scalar iterations.  The possibilities are:
2103 
2104    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105        In this case:
2106 
2107 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108 	 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109 	 LOOP_VINFO_PEELING_FOR_NITER == false
2110 
2111    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112        to handle the remaining scalar iterations.  In this case:
2113 
2114 	 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115 	 LOOP_VINFO_PEELING_FOR_NITER == true
2116 
2117        There are two choices:
2118 
2119        (2a) Consider vectorizing the epilogue loop at the same VF as the
2120 	    main loop, but using partial vectors instead of full vectors.
2121 	    In this case:
2122 
2123 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2124 
2125        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126 	    In this case:
2127 
2128 	      LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2129 
2130    When FOR_EPILOGUE_P is true, make this determination based on the
2131    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132    based on the assumption that LOOP_VINFO is the main loop.  The caller
2133    has made sure that the number of iterations is set appropriately for
2134    this value of FOR_EPILOGUE_P.  */
2135 
2136 opt_result
2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138 					    bool for_epilogue_p)
2139 {
2140   /* Determine whether there would be any scalar iterations left over.  */
2141   bool need_peeling_or_partial_vectors_p
2142     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2143 
2144   /* Decide whether to vectorize the loop with partial vectors.  */
2145   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148       && need_peeling_or_partial_vectors_p)
2149     {
2150       /* For partial-vector-usage=1, try to push the handling of partial
2151 	 vectors to the epilogue, with the main loop continuing to operate
2152 	 on full vectors.
2153 
2154 	 If we are unrolling we also do not want to use partial vectors. This
2155 	 is to avoid the overhead of generating multiple masks and also to
2156 	 avoid having to execute entire iterations of FALSE masked instructions
2157 	 when dealing with one or less full iterations.
2158 
2159 	 ??? We could then end up failing to use partial vectors if we
2160 	 decide to peel iterations into a prologue, and if the main loop
2161 	 then ends up processing fewer than VF iterations.  */
2162       if ((param_vect_partial_vector_usage == 1
2163 	   || loop_vinfo->suggested_unroll_factor > 1)
2164 	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165 	  && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166 	LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167       else
2168 	LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2169     }
2170 
2171   if (dump_enabled_p ())
2172     {
2173       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 	dump_printf_loc (MSG_NOTE, vect_location,
2175 			 "operating on partial vectors%s.\n",
2176 			 for_epilogue_p ? " for epilogue loop" : "");
2177       else
2178 	dump_printf_loc (MSG_NOTE, vect_location,
2179 			 "operating only on full vectors%s.\n",
2180 			 for_epilogue_p ? " for epilogue loop" : "");
2181     }
2182 
2183   if (for_epilogue_p)
2184     {
2185       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186       gcc_assert (orig_loop_vinfo);
2187       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188 	gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189 			      LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2190     }
2191 
2192   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2194     {
2195       /* Check that the loop processes at least one full vector.  */
2196       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198       if (known_lt (wi::to_widest (scalar_niters), vf))
2199 	return opt_result::failure_at (vect_location,
2200 				       "loop does not have enough iterations"
2201 				       " to support vectorization.\n");
2202 
2203       /* If we need to peel an extra epilogue iteration to handle data
2204 	 accesses with gaps, check that there are enough scalar iterations
2205 	 available.
2206 
2207 	 The check above is redundant with this one when peeling for gaps,
2208 	 but the distinction is useful for diagnostics.  */
2209       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211 	  && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212 	return opt_result::failure_at (vect_location,
2213 				       "loop does not have enough iterations"
2214 				       " to support peeling for gaps.\n");
2215     }
2216 
2217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219        && need_peeling_or_partial_vectors_p);
2220 
2221   return opt_result::success ();
2222 }
2223 
2224 /* Function vect_analyze_loop_2.
2225 
2226    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227    for it.  The different analyses will record information in the
2228    loop_vec_info struct.  */
2229 static opt_result
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231 		     unsigned *suggested_unroll_factor)
2232 {
2233   opt_result ok = opt_result::success ();
2234   int res;
2235   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236   poly_uint64 min_vf = 2;
2237   loop_vec_info orig_loop_vinfo = NULL;
2238 
2239   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240      loop_vec_info of the first vectorized loop.  */
2241   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243   else
2244     orig_loop_vinfo = loop_vinfo;
2245   gcc_assert (orig_loop_vinfo);
2246 
2247   /* The first group of checks is independent of the vector size.  */
2248   fatal = true;
2249 
2250   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252     return opt_result::failure_at (vect_location,
2253 				   "not vectorized: simd if(0)\n");
2254 
2255   /* Find all data references in the loop (which correspond to vdefs/vuses)
2256      and analyze their evolution in the loop.  */
2257 
2258   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2259 
2260   /* Gather the data references and count stmts in the loop.  */
2261   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2262     {
2263       opt_result res
2264 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
2266 				     &LOOP_VINFO_N_STMTS (loop_vinfo));
2267       if (!res)
2268 	{
2269 	  if (dump_enabled_p ())
2270 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 			     "not vectorized: loop contains function "
2272 			     "calls or data references that cannot "
2273 			     "be analyzed\n");
2274 	  return res;
2275 	}
2276       loop_vinfo->shared->save_datarefs ();
2277     }
2278   else
2279     loop_vinfo->shared->check_datarefs ();
2280 
2281   /* Analyze the data references and also adjust the minimal
2282      vectorization factor according to the loads and stores.  */
2283 
2284   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 			 "bad data references.\n");
2290       return ok;
2291     }
2292 
2293   /* Classify all cross-iteration scalar data-flow cycles.
2294      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2295   vect_analyze_scalar_cycles (loop_vinfo);
2296 
2297   vect_pattern_recog (loop_vinfo);
2298 
2299   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2300 
2301   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2303 
2304   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305   if (!ok)
2306     {
2307       if (dump_enabled_p ())
2308 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 			 "bad data access.\n");
2310       return ok;
2311     }
2312 
2313   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2314 
2315   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316   if (!ok)
2317     {
2318       if (dump_enabled_p ())
2319 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 			 "unexpected pattern.\n");
2321       return ok;
2322     }
2323 
2324   /* While the rest of the analysis below depends on it in some way.  */
2325   fatal = false;
2326 
2327   /* Analyze data dependences between the data-refs in the loop
2328      and adjust the maximum vectorization factor according to
2329      the dependences.
2330      FORNOW: fail at the first data dependence that we encounter.  */
2331 
2332   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333   if (!ok)
2334     {
2335       if (dump_enabled_p ())
2336 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 			 "bad data dependence.\n");
2338       return ok;
2339     }
2340   if (max_vf != MAX_VECTORIZATION_FACTOR
2341       && maybe_lt (max_vf, min_vf))
2342     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2344 
2345   ok = vect_determine_vectorization_factor (loop_vinfo);
2346   if (!ok)
2347     {
2348       if (dump_enabled_p ())
2349 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350 			 "can't determine vectorization factor.\n");
2351       return ok;
2352     }
2353   if (max_vf != MAX_VECTORIZATION_FACTOR
2354       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2356 
2357   /* Compute the scalar iteration cost.  */
2358   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2359 
2360   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 
2362   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2363   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364   if (!ok)
2365     return ok;
2366 
2367   /* If there are any SLP instances mark them as pure_slp.  */
2368   bool slp = vect_make_slp_decision (loop_vinfo);
2369   if (slp)
2370     {
2371       /* Find stmts that need to be both vectorized and SLPed.  */
2372       vect_detect_hybrid_slp (loop_vinfo);
2373 
2374       /* Update the vectorization factor based on the SLP decision.  */
2375       vect_update_vf_for_slp (loop_vinfo);
2376 
2377       /* Optimize the SLP graph with the vectorization factor fixed.  */
2378       vect_optimize_slp (loop_vinfo);
2379 
2380       /* Gather the loads reachable from the SLP graph entries.  */
2381       vect_gather_slp_loads (loop_vinfo);
2382     }
2383 
2384   bool saved_can_use_partial_vectors_p
2385     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2386 
2387   /* We don't expect to have to roll back to anything other than an empty
2388      set of rgroups.  */
2389   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2390 
2391   /* This is the point where we can re-start analysis with SLP forced off.  */
2392 start_over:
2393 
2394   /* Apply the suggested unrolling factor, this was determined by the backend
2395      during finish_cost the first time we ran the analyzis for this
2396      vector mode.  */
2397   if (loop_vinfo->suggested_unroll_factor > 1)
2398     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2399 
2400   /* Now the vectorization factor is final.  */
2401   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402   gcc_assert (known_ne (vectorization_factor, 0U));
2403 
2404   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2405     {
2406       dump_printf_loc (MSG_NOTE, vect_location,
2407 		       "vectorization_factor = ");
2408       dump_dec (MSG_NOTE, vectorization_factor);
2409       dump_printf (MSG_NOTE, ", niters = %wd\n",
2410 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2411     }
2412 
2413   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2414 
2415   /* Analyze the alignment of the data-refs in the loop.
2416      Fail if a data reference is found that cannot be vectorized.  */
2417 
2418   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419   if (!ok)
2420     {
2421       if (dump_enabled_p ())
2422 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423 			 "bad data alignment.\n");
2424       return ok;
2425     }
2426 
2427   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428      It is important to call pruning after vect_analyze_data_ref_accesses,
2429      since we use grouping information gathered by interleaving analysis.  */
2430   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431   if (!ok)
2432     return ok;
2433 
2434   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435      vectorization, since we do not want to add extra peeling or
2436      add versioning for alignment.  */
2437   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438     /* This pass will decide on using loop versioning and/or loop peeling in
2439        order to enhance the alignment of data references in the loop.  */
2440     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441   if (!ok)
2442     return ok;
2443 
2444   if (slp)
2445     {
2446       /* Analyze operations in the SLP instances.  Note this may
2447 	 remove unsupported SLP instances which makes the above
2448 	 SLP kind detection invalid.  */
2449       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450       vect_slp_analyze_operations (loop_vinfo);
2451       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2452 	{
2453 	  ok = opt_result::failure_at (vect_location,
2454 				       "unsupported SLP instances\n");
2455 	  goto again;
2456 	}
2457 
2458       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2459       slp_tree load_node, slp_root;
2460       unsigned i, x;
2461       slp_instance instance;
2462       bool can_use_lanes = true;
2463       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2464 	{
2465 	  slp_root = SLP_INSTANCE_TREE (instance);
2466 	  int group_size = SLP_TREE_LANES (slp_root);
2467 	  tree vectype = SLP_TREE_VECTYPE (slp_root);
2468 	  bool loads_permuted = false;
2469 	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2470 	    {
2471 	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472 		continue;
2473 	      unsigned j;
2474 	      stmt_vec_info load_info;
2475 	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476 		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2477 		  {
2478 		    loads_permuted = true;
2479 		    break;
2480 		  }
2481 	    }
2482 
2483 	  /* If the loads and stores can be handled with load/store-lane
2484 	     instructions record it and move on to the next instance.  */
2485 	  if (loads_permuted
2486 	      && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487 	      && vect_store_lanes_supported (vectype, group_size, false))
2488 	    {
2489 	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2490 		{
2491 		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492 		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493 		  /* Use SLP for strided accesses (or if we can't
2494 		     load-lanes).  */
2495 		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496 		      || ! vect_load_lanes_supported
2497 			    (STMT_VINFO_VECTYPE (stmt_vinfo),
2498 			     DR_GROUP_SIZE (stmt_vinfo), false))
2499 		    break;
2500 		}
2501 
2502 	      can_use_lanes
2503 		= can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2504 
2505 	      if (can_use_lanes && dump_enabled_p ())
2506 		dump_printf_loc (MSG_NOTE, vect_location,
2507 				 "SLP instance %p can use load/store-lanes\n",
2508 				 instance);
2509 	    }
2510 	  else
2511 	    {
2512 	      can_use_lanes = false;
2513 	      break;
2514 	    }
2515 	}
2516 
2517       /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 	 with SLP disabled.  */
2519       if (can_use_lanes)
2520 	{
2521 	  ok = opt_result::failure_at (vect_location,
2522 				       "Built SLP cancelled: can use "
2523 				       "load/store-lanes\n");
2524 	  if (dump_enabled_p ())
2525 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 			     "Built SLP cancelled: all SLP instances support "
2527 			     "load/store-lanes\n");
2528 	  goto again;
2529 	}
2530     }
2531 
2532   /* Dissolve SLP-only groups.  */
2533   vect_dissolve_slp_only_groups (loop_vinfo);
2534 
2535   /* Scan all the remaining operations in the loop that are not subject
2536      to SLP and make sure they are vectorizable.  */
2537   ok = vect_analyze_loop_operations (loop_vinfo);
2538   if (!ok)
2539     {
2540       if (dump_enabled_p ())
2541 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 			 "bad operation or unsupported loop bound.\n");
2543       return ok;
2544     }
2545 
2546   /* For now, we don't expect to mix both masking and length approaches for one
2547      loop, disable it if both are recorded.  */
2548   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2551     {
2552       if (dump_enabled_p ())
2553 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 			 "can't vectorize a loop with partial vectors"
2555 			 " because we don't expect to mix different"
2556 			 " approaches with partial vectors for the"
2557 			 " same loop.\n");
2558       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2559     }
2560 
2561   /* If we still have the option of using partial vectors,
2562      check whether we can generate the necessary loop controls.  */
2563   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564       && !vect_verify_full_masking (loop_vinfo)
2565       && !vect_verify_loop_lens (loop_vinfo))
2566     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2567 
2568   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569      to be able to handle fewer than VF scalars, or needs to have a lower VF
2570      than the main loop.  */
2571   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574 		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575     return opt_result::failure_at (vect_location,
2576 				   "Vectorization factor too high for"
2577 				   " epilogue loop.\n");
2578 
2579   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580      assuming that the loop will be used as a main loop.  We will redo
2581      this analysis later if we instead decide to use the loop as an
2582      epilogue loop.  */
2583   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584   if (!ok)
2585     return ok;
2586 
2587   /* Check the costings of the loop make vectorizing worthwhile.  */
2588   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589   if (res < 0)
2590     {
2591       ok = opt_result::failure_at (vect_location,
2592 				   "Loop costings may not be worthwhile.\n");
2593       goto again;
2594     }
2595   if (!res)
2596     return opt_result::failure_at (vect_location,
2597 				   "Loop costings not worthwhile.\n");
2598 
2599   /* If an epilogue loop is required make sure we can create one.  */
2600   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602     {
2603       if (dump_enabled_p ())
2604         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605       if (!vect_can_advance_ivs_p (loop_vinfo)
2606 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607 					   single_exit (LOOP_VINFO_LOOP
2608 							 (loop_vinfo))))
2609         {
2610 	  ok = opt_result::failure_at (vect_location,
2611 				       "not vectorized: can't create required "
2612 				       "epilog loop\n");
2613           goto again;
2614         }
2615     }
2616 
2617   /* During peeling, we need to check if number of loop iterations is
2618      enough for both peeled prolog loop and vector loop.  This check
2619      can be merged along with threshold check of loop versioning, so
2620      increase threshold for this case if necessary.
2621 
2622      If we are analyzing an epilogue we still want to check what its
2623      versioning threshold would be.  If we decide to vectorize the epilogues we
2624      will want to use the lowest versioning threshold of all epilogues and main
2625      loop.  This will enable us to enter a vectorized epilogue even when
2626      versioning the loop.  We can't simply check whether the epilogue requires
2627      versioning though since we may have skipped some versioning checks when
2628      analyzing the epilogue.  For instance, checks for alias versioning will be
2629      skipped when dealing with epilogues as we assume we already checked them
2630      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2631   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2632     {
2633       poly_uint64 niters_th = 0;
2634       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2635 
2636       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2637 	{
2638 	  /* Niters for peeled prolog loop.  */
2639 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2640 	    {
2641 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2644 	    }
2645 	  else
2646 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647 	}
2648 
2649       /* Niters for at least one iteration of vectorized loop.  */
2650       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652       /* One additional iteration because of peeling for gap.  */
2653       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654 	niters_th += 1;
2655 
2656       /*  Use the same condition as vect_transform_loop to decide when to use
2657 	  the cost to determine a versioning threshold.  */
2658       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659 	  && ordered_p (th, niters_th))
2660 	niters_th = ordered_max (poly_uint64 (th), niters_th);
2661 
2662       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2663     }
2664 
2665   gcc_assert (known_eq (vectorization_factor,
2666 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2667 
2668   /* Ok to vectorize!  */
2669   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670   return opt_result::success ();
2671 
2672 again:
2673   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2674   gcc_assert (!ok);
2675 
2676   /* Try again with SLP forced off but if we didn't do any SLP there is
2677      no point in re-trying.  */
2678   if (!slp)
2679     return ok;
2680 
2681   /* If there are reduction chains re-trying will fail anyway.  */
2682   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683     return ok;
2684 
2685   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686      via interleaving or lane instructions.  */
2687   slp_instance instance;
2688   slp_tree node;
2689   unsigned i, j;
2690   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691     {
2692       stmt_vec_info vinfo;
2693       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695 	continue;
2696       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697       unsigned int size = DR_GROUP_SIZE (vinfo);
2698       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699       if (! vect_store_lanes_supported (vectype, size, false)
2700 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701 	 && ! vect_grouped_store_supported (vectype, size))
2702 	return opt_result::failure_at (vinfo->stmt,
2703 				       "unsupported grouped store\n");
2704       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2705 	{
2706 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709 	  size = DR_GROUP_SIZE (vinfo);
2710 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2711 	  if (! vect_load_lanes_supported (vectype, size, false)
2712 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2713 						size))
2714 	    return opt_result::failure_at (vinfo->stmt,
2715 					   "unsupported grouped load\n");
2716 	}
2717     }
2718 
2719   if (dump_enabled_p ())
2720     dump_printf_loc (MSG_NOTE, vect_location,
2721 		     "re-trying with SLP disabled\n");
2722 
2723   /* Roll back state appropriately.  No SLP this time.  */
2724   slp = false;
2725   /* Restore vectorization factor as it were without SLP.  */
2726   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727   /* Free the SLP instances.  */
2728   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729     vect_free_slp_instance (instance);
2730   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731   /* Reset SLP type to loop_vect on all stmts.  */
2732   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2733     {
2734       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736 	   !gsi_end_p (si); gsi_next (&si))
2737 	{
2738 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2740 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741 	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2742 	    {
2743 	      /* vectorizable_reduction adjusts reduction stmt def-types,
2744 		 restore them to that of the PHI.  */
2745 	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746 		= STMT_VINFO_DEF_TYPE (stmt_info);
2747 	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 					(STMT_VINFO_REDUC_DEF (stmt_info)))
2749 		= STMT_VINFO_DEF_TYPE (stmt_info);
2750 	    }
2751 	}
2752       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753 	   !gsi_end_p (si); gsi_next (&si))
2754 	{
2755 	  if (is_gimple_debug (gsi_stmt (si)))
2756 	    continue;
2757 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2759 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2760 	    {
2761 	      stmt_vec_info pattern_stmt_info
2762 		= STMT_VINFO_RELATED_STMT (stmt_info);
2763 	      if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764 		STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2765 
2766 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767 	      STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769 		   !gsi_end_p (pi); gsi_next (&pi))
2770 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771 		  = loop_vect;
2772 	    }
2773 	}
2774     }
2775   /* Free optimized alias test DDRS.  */
2776   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779   /* Reset target cost data.  */
2780   delete loop_vinfo->vector_costs;
2781   loop_vinfo->vector_costs = nullptr;
2782   /* Reset accumulated rgroup information.  */
2783   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785   /* Reset assorted flags.  */
2786   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791     = saved_can_use_partial_vectors_p;
2792 
2793   goto start_over;
2794 }
2795 
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2798    OLD_LOOP_VINFO is better unless something specifically indicates
2799    otherwise.
2800 
2801    Note that this deliberately isn't a partial order.  */
2802 
2803 static bool
2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805 			  loop_vec_info old_loop_vinfo)
2806 {
2807   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2809 
2810   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2812 
2813   /* Always prefer a VF of loop->simdlen over any other VF.  */
2814   if (loop->simdlen)
2815     {
2816       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818       if (new_simdlen_p != old_simdlen_p)
2819 	return new_simdlen_p;
2820     }
2821 
2822   const auto *old_costs = old_loop_vinfo->vector_costs;
2823   const auto *new_costs = new_loop_vinfo->vector_costs;
2824   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2826 
2827   return new_costs->better_main_loop_than_p (old_costs);
2828 }
2829 
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2831    true if we should.  */
2832 
2833 static bool
2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835 			loop_vec_info old_loop_vinfo)
2836 {
2837   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838     return false;
2839 
2840   if (dump_enabled_p ())
2841     dump_printf_loc (MSG_NOTE, vect_location,
2842 		     "***** Preferring vector mode %s to vector mode %s\n",
2843 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845   return true;
2846 }
2847 
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850    MODE_I to the next mode useful to analyze.
2851    Return the loop_vinfo on success and wrapped null on failure.  */
2852 
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855 		     const vect_loop_form_info *loop_form_info,
2856 		     loop_vec_info main_loop_vinfo,
2857 		     const vector_modes &vector_modes, unsigned &mode_i,
2858 		     machine_mode &autodetected_vector_mode,
2859 		     bool &fatal)
2860 {
2861   loop_vec_info loop_vinfo
2862     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2863 
2864   machine_mode vector_mode = vector_modes[mode_i];
2865   loop_vinfo->vector_mode = vector_mode;
2866   unsigned int suggested_unroll_factor = 1;
2867 
2868   /* Run the main analysis.  */
2869   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870 					&suggested_unroll_factor);
2871   if (dump_enabled_p ())
2872     dump_printf_loc (MSG_NOTE, vect_location,
2873 		     "***** Analysis %s with vector mode %s\n",
2874 		     res ? "succeeded" : " failed",
2875 		     GET_MODE_NAME (loop_vinfo->vector_mode));
2876 
2877   if (!main_loop_vinfo && suggested_unroll_factor > 1)
2878     {
2879       if (dump_enabled_p ())
2880 	dump_printf_loc (MSG_NOTE, vect_location,
2881 			 "***** Re-trying analysis for unrolling"
2882 			 " with unroll factor %d.\n",
2883 			 suggested_unroll_factor);
2884       loop_vec_info unroll_vinfo
2885 	= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886       unroll_vinfo->vector_mode = vector_mode;
2887       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889       if (new_res)
2890 	{
2891 	  delete loop_vinfo;
2892 	  loop_vinfo = unroll_vinfo;
2893 	}
2894       else
2895 	delete unroll_vinfo;
2896     }
2897 
2898   /* Remember the autodetected vector mode.  */
2899   if (vector_mode == VOIDmode)
2900     autodetected_vector_mode = loop_vinfo->vector_mode;
2901 
2902   /* Advance mode_i, first skipping modes that would result in the
2903      same analysis result.  */
2904   while (mode_i + 1 < vector_modes.length ()
2905 	 && vect_chooses_same_modes_p (loop_vinfo,
2906 				       vector_modes[mode_i + 1]))
2907     {
2908       if (dump_enabled_p ())
2909 	dump_printf_loc (MSG_NOTE, vect_location,
2910 			 "***** The result for vector mode %s would"
2911 			 " be the same\n",
2912 			 GET_MODE_NAME (vector_modes[mode_i + 1]));
2913       mode_i += 1;
2914     }
2915   if (mode_i + 1 < vector_modes.length ()
2916       && VECTOR_MODE_P (autodetected_vector_mode)
2917       && (related_vector_mode (vector_modes[mode_i + 1],
2918 			       GET_MODE_INNER (autodetected_vector_mode))
2919 	  == autodetected_vector_mode)
2920       && (related_vector_mode (autodetected_vector_mode,
2921 			       GET_MODE_INNER (vector_modes[mode_i + 1]))
2922 	  == vector_modes[mode_i + 1]))
2923     {
2924       if (dump_enabled_p ())
2925 	dump_printf_loc (MSG_NOTE, vect_location,
2926 			 "***** Skipping vector mode %s, which would"
2927 			 " repeat the analysis for %s\n",
2928 			 GET_MODE_NAME (vector_modes[mode_i + 1]),
2929 			 GET_MODE_NAME (autodetected_vector_mode));
2930       mode_i += 1;
2931     }
2932   mode_i++;
2933 
2934   if (!res)
2935     {
2936       delete loop_vinfo;
2937       if (fatal)
2938 	gcc_checking_assert (main_loop_vinfo == NULL);
2939       return opt_loop_vec_info::propagate_failure (res);
2940     }
2941 
2942   return opt_loop_vec_info::success (loop_vinfo);
2943 }
2944 
2945 /* Function vect_analyze_loop.
2946 
2947    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948    for it.  The different analyses will record information in the
2949    loop_vec_info struct.  */
2950 opt_loop_vec_info
2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2952 {
2953   DUMP_VECT_SCOPE ("analyze_loop_nest");
2954 
2955   if (loop_outer (loop)
2956       && loop_vec_info_for_loop (loop_outer (loop))
2957       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958     return opt_loop_vec_info::failure_at (vect_location,
2959 					  "outer-loop already vectorized.\n");
2960 
2961   if (!find_loop_nest (loop, &shared->loop_nest))
2962     return opt_loop_vec_info::failure_at
2963       (vect_location,
2964        "not vectorized: loop nest containing two or more consecutive inner"
2965        " loops cannot be vectorized\n");
2966 
2967   /* Analyze the loop form.  */
2968   vect_loop_form_info loop_form_info;
2969   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970   if (!res)
2971     {
2972       if (dump_enabled_p ())
2973 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 			 "bad loop form.\n");
2975       return opt_loop_vec_info::propagate_failure (res);
2976     }
2977   if (!integer_onep (loop_form_info.assumptions))
2978     {
2979       /* We consider to vectorize this loop by versioning it under
2980 	 some assumptions.  In order to do this, we need to clear
2981 	 existing information computed by scev and niter analyzer.  */
2982       scev_reset_htab ();
2983       free_numbers_of_iterations_estimates (loop);
2984       /* Also set flag for this loop so that following scev and niter
2985 	 analysis are done under the assumptions.  */
2986       loop_constraint_set (loop, LOOP_C_FINITE);
2987     }
2988 
2989   auto_vector_modes vector_modes;
2990   /* Autodetect first vector size we try.  */
2991   vector_modes.safe_push (VOIDmode);
2992   unsigned int autovec_flags
2993     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994 						    loop->simdlen != 0);
2995   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996 			     && !unlimited_cost_model (loop));
2997   machine_mode autodetected_vector_mode = VOIDmode;
2998   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999   unsigned int mode_i = 0;
3000   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3001 
3002   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3003      a mode has not been analyzed.  */
3004   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005   for (unsigned i = 0; i < vector_modes.length (); ++i)
3006     cached_vf_per_mode.safe_push (0);
3007 
3008   /* First determine the main loop vectorization mode, either the first
3009      one that works, starting with auto-detecting the vector mode and then
3010      following the targets order of preference, or the one with the
3011      lowest cost if pick_lowest_cost_p.  */
3012   while (1)
3013     {
3014       bool fatal;
3015       unsigned int last_mode_i = mode_i;
3016       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017 	 failed.  */
3018       cached_vf_per_mode[last_mode_i] = -1;
3019       opt_loop_vec_info loop_vinfo
3020 	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021 			       NULL, vector_modes, mode_i,
3022 			       autodetected_vector_mode, fatal);
3023       if (fatal)
3024 	break;
3025 
3026       if (loop_vinfo)
3027 	{
3028 	  /*  Analyzis has been successful so update the VF value.  The
3029 	      VF should always be a multiple of unroll_factor and we want to
3030 	      capture the original VF here.  */
3031 	  cached_vf_per_mode[last_mode_i]
3032 	    = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033 			 loop_vinfo->suggested_unroll_factor);
3034 	  /* Once we hit the desired simdlen for the first time,
3035 	     discard any previous attempts.  */
3036 	  if (simdlen
3037 	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 	    {
3039 	      delete first_loop_vinfo;
3040 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041 	      simdlen = 0;
3042 	    }
3043 	  else if (pick_lowest_cost_p
3044 		   && first_loop_vinfo
3045 		   && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046 	    {
3047 	      /* Pick loop_vinfo over first_loop_vinfo.  */
3048 	      delete first_loop_vinfo;
3049 	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 	    }
3051 	  if (first_loop_vinfo == NULL)
3052 	    first_loop_vinfo = loop_vinfo;
3053 	  else
3054 	    {
3055 	      delete loop_vinfo;
3056 	      loop_vinfo = opt_loop_vec_info::success (NULL);
3057 	    }
3058 
3059 	  /* Commit to first_loop_vinfo if we have no reason to try
3060 	     alternatives.  */
3061 	  if (!simdlen && !pick_lowest_cost_p)
3062 	    break;
3063 	}
3064       if (mode_i == vector_modes.length ()
3065 	  || autodetected_vector_mode == VOIDmode)
3066 	break;
3067 
3068       /* Try the next biggest vector size.  */
3069       if (dump_enabled_p ())
3070 	dump_printf_loc (MSG_NOTE, vect_location,
3071 			 "***** Re-trying analysis with vector mode %s\n",
3072 			 GET_MODE_NAME (vector_modes[mode_i]));
3073     }
3074   if (!first_loop_vinfo)
3075     return opt_loop_vec_info::propagate_failure (res);
3076 
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079 		     "***** Choosing vector mode %s\n",
3080 		     GET_MODE_NAME (first_loop_vinfo->vector_mode));
3081 
3082   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083      enabled, SIMDUID is not set, it is the innermost loop and we have
3084      either already found the loop's SIMDLEN or there was no SIMDLEN to
3085      begin with.
3086      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3087   bool vect_epilogues = (!simdlen
3088 			 && loop->inner == NULL
3089 			 && param_vect_epilogues_nomask
3090 			 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091 			 && !loop->simduid);
3092   if (!vect_epilogues)
3093     return first_loop_vinfo;
3094 
3095   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3096   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3097 
3098   /* For epilogues start the analysis from the first mode.  The motivation
3099      behind starting from the beginning comes from cases where the VECTOR_MODES
3100      array may contain length-agnostic and length-specific modes.  Their
3101      ordering is not guaranteed, so we could end up picking a mode for the main
3102      loop that is after the epilogue's optimal mode.  */
3103   vector_modes[0] = autodetected_vector_mode;
3104   mode_i = 0;
3105 
3106   bool supports_partial_vectors =
3107     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3109 
3110   while (1)
3111     {
3112       /* If the target does not support partial vectors we can shorten the
3113 	 number of modes to analyze for the epilogue as we know we can't pick a
3114 	 mode that would lead to a VF at least as big as the
3115 	 FIRST_VINFO_VF.  */
3116       if (!supports_partial_vectors
3117 	  && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3118 	{
3119 	  mode_i++;
3120 	  if (mode_i == vector_modes.length ())
3121 	    break;
3122 	  continue;
3123 	}
3124 
3125       if (dump_enabled_p ())
3126 	dump_printf_loc (MSG_NOTE, vect_location,
3127 			 "***** Re-trying epilogue analysis with vector "
3128 			 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3129 
3130       bool fatal;
3131       opt_loop_vec_info loop_vinfo
3132 	= vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133 			       first_loop_vinfo,
3134 			       vector_modes, mode_i,
3135 			       autodetected_vector_mode, fatal);
3136       if (fatal)
3137 	break;
3138 
3139       if (loop_vinfo)
3140 	{
3141 	  if (pick_lowest_cost_p)
3142 	    {
3143 	      /* Keep trying to roll back vectorization attempts while the
3144 		 loop_vec_infos they produced were worse than this one.  */
3145 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146 	      while (!vinfos.is_empty ()
3147 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3148 		{
3149 		  gcc_assert (vect_epilogues);
3150 		  delete vinfos.pop ();
3151 		}
3152 	    }
3153 	  /* For now only allow one epilogue loop.  */
3154 	  if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3155 	    {
3156 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158 	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159 			  || maybe_ne (lowest_th, 0U));
3160 	      /* Keep track of the known smallest versioning
3161 		 threshold.  */
3162 	      if (ordered_p (lowest_th, th))
3163 		lowest_th = ordered_min (lowest_th, th);
3164 	    }
3165 	  else
3166 	    {
3167 	      delete loop_vinfo;
3168 	      loop_vinfo = opt_loop_vec_info::success (NULL);
3169 	    }
3170 
3171 	  /* For now only allow one epilogue loop, but allow
3172 	     pick_lowest_cost_p to replace it, so commit to the
3173 	     first epilogue if we have no reason to try alternatives.  */
3174 	  if (!pick_lowest_cost_p)
3175 	    break;
3176 	}
3177 
3178       if (mode_i == vector_modes.length ())
3179 	break;
3180 
3181     }
3182 
3183   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3184     {
3185       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186       if (dump_enabled_p ())
3187 	dump_printf_loc (MSG_NOTE, vect_location,
3188 			 "***** Choosing epilogue vector mode %s\n",
3189 			 GET_MODE_NAME
3190 			   (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3191     }
3192 
3193   return first_loop_vinfo;
3194 }
3195 
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197    it in *REDUC_FN if so.  */
3198 
3199 static bool
3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3201 {
3202   if (code == PLUS_EXPR)
3203     {
3204       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205       return true;
3206     }
3207   return false;
3208 }
3209 
3210 /* Function reduction_fn_for_scalar_code
3211 
3212    Input:
3213    CODE - tree_code of a reduction operations.
3214 
3215    Output:
3216    REDUC_FN - the corresponding internal function to be used to reduce the
3217       vector of partial results into a single scalar result, or IFN_LAST
3218       if the operation is a supported reduction operation, but does not have
3219       such an internal function.
3220 
3221    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3222 
3223 bool
3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3225 {
3226   if (code.is_tree_code ())
3227     switch (tree_code (code))
3228       {
3229       case MAX_EXPR:
3230 	*reduc_fn = IFN_REDUC_MAX;
3231 	return true;
3232 
3233       case MIN_EXPR:
3234 	*reduc_fn = IFN_REDUC_MIN;
3235 	return true;
3236 
3237       case PLUS_EXPR:
3238 	*reduc_fn = IFN_REDUC_PLUS;
3239 	return true;
3240 
3241       case BIT_AND_EXPR:
3242 	*reduc_fn = IFN_REDUC_AND;
3243 	return true;
3244 
3245       case BIT_IOR_EXPR:
3246 	*reduc_fn = IFN_REDUC_IOR;
3247 	return true;
3248 
3249       case BIT_XOR_EXPR:
3250 	*reduc_fn = IFN_REDUC_XOR;
3251 	return true;
3252 
3253       case MULT_EXPR:
3254       case MINUS_EXPR:
3255 	*reduc_fn = IFN_LAST;
3256 	return true;
3257 
3258       default:
3259 	return false;
3260       }
3261   else
3262     switch (combined_fn (code))
3263       {
3264       CASE_CFN_FMAX:
3265 	*reduc_fn = IFN_REDUC_FMAX;
3266 	return true;
3267 
3268       CASE_CFN_FMIN:
3269 	*reduc_fn = IFN_REDUC_FMIN;
3270 	return true;
3271 
3272       default:
3273 	return false;
3274       }
3275 }
3276 
3277 /* If there is a neutral value X such that a reduction would not be affected
3278    by the introduction of additional X elements, return that X, otherwise
3279    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3280    of the scalar elements.  If the reduction has just a single initial value
3281    then INITIAL_VALUE is that value, otherwise it is null.  */
3282 
3283 tree
3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285 			  tree initial_value)
3286 {
3287   if (code.is_tree_code ())
3288     switch (tree_code (code))
3289       {
3290       case WIDEN_SUM_EXPR:
3291       case DOT_PROD_EXPR:
3292       case SAD_EXPR:
3293       case PLUS_EXPR:
3294       case MINUS_EXPR:
3295       case BIT_IOR_EXPR:
3296       case BIT_XOR_EXPR:
3297 	return build_zero_cst (scalar_type);
3298 
3299       case MULT_EXPR:
3300 	return build_one_cst (scalar_type);
3301 
3302       case BIT_AND_EXPR:
3303 	return build_all_ones_cst (scalar_type);
3304 
3305       case MAX_EXPR:
3306       case MIN_EXPR:
3307 	return initial_value;
3308 
3309       default:
3310 	return NULL_TREE;
3311       }
3312   else
3313     switch (combined_fn (code))
3314       {
3315       CASE_CFN_FMIN:
3316       CASE_CFN_FMAX:
3317 	return initial_value;
3318 
3319       default:
3320 	return NULL_TREE;
3321       }
3322 }
3323 
3324 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3325    STMT is printed with a message MSG. */
3326 
3327 static void
3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3329 {
3330   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3331 }
3332 
3333 /* Return true if we need an in-order reduction for operation CODE
3334    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3335    overflow must wrap.  */
3336 
3337 bool
3338 needs_fold_left_reduction_p (tree type, code_helper code)
3339 {
3340   /* CHECKME: check for !flag_finite_math_only too?  */
3341   if (SCALAR_FLOAT_TYPE_P (type))
3342     {
3343       if (code.is_tree_code ())
3344 	switch (tree_code (code))
3345 	  {
3346 	  case MIN_EXPR:
3347 	  case MAX_EXPR:
3348 	    return false;
3349 
3350 	  default:
3351 	    return !flag_associative_math;
3352 	  }
3353       else
3354 	switch (combined_fn (code))
3355 	  {
3356 	  CASE_CFN_FMIN:
3357 	  CASE_CFN_FMAX:
3358 	    return false;
3359 
3360 	  default:
3361 	    return !flag_associative_math;
3362 	  }
3363     }
3364 
3365   if (INTEGRAL_TYPE_P (type))
3366     return (!code.is_tree_code ()
3367 	    || !operation_no_trapping_overflow (type, tree_code (code)));
3368 
3369   if (SAT_FIXED_POINT_TYPE_P (type))
3370     return true;
3371 
3372   return false;
3373 }
3374 
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376    has a handled computation expression.  Store the main reduction
3377    operation in *CODE.  */
3378 
3379 static bool
3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381 		      tree loop_arg, code_helper *code,
3382 		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3383 {
3384   auto_bitmap visited;
3385   tree lookfor = PHI_RESULT (phi);
3386   ssa_op_iter curri;
3387   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388   while (USE_FROM_PTR (curr) != loop_arg)
3389     curr = op_iter_next_use (&curri);
3390   curri.i = curri.numops;
3391   do
3392     {
3393       path.safe_push (std::make_pair (curri, curr));
3394       tree use = USE_FROM_PTR (curr);
3395       if (use == lookfor)
3396 	break;
3397       gimple *def = SSA_NAME_DEF_STMT (use);
3398       if (gimple_nop_p (def)
3399 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3400 	{
3401 pop:
3402 	  do
3403 	    {
3404 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405 	      curri = x.first;
3406 	      curr = x.second;
3407 	      do
3408 		curr = op_iter_next_use (&curri);
3409 	      /* Skip already visited or non-SSA operands (from iterating
3410 	         over PHI args).  */
3411 	      while (curr != NULL_USE_OPERAND_P
3412 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413 			 || ! bitmap_set_bit (visited,
3414 					      SSA_NAME_VERSION
3415 					        (USE_FROM_PTR (curr)))));
3416 	    }
3417 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418 	  if (curr == NULL_USE_OPERAND_P)
3419 	    break;
3420 	}
3421       else
3422 	{
3423 	  if (gimple_code (def) == GIMPLE_PHI)
3424 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425 	  else
3426 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427 	  while (curr != NULL_USE_OPERAND_P
3428 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429 		     || ! bitmap_set_bit (visited,
3430 					  SSA_NAME_VERSION
3431 					    (USE_FROM_PTR (curr)))))
3432 	    curr = op_iter_next_use (&curri);
3433 	  if (curr == NULL_USE_OPERAND_P)
3434 	    goto pop;
3435 	}
3436     }
3437   while (1);
3438   if (dump_file && (dump_flags & TDF_DETAILS))
3439     {
3440       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441       unsigned i;
3442       std::pair<ssa_op_iter, use_operand_p> *x;
3443       FOR_EACH_VEC_ELT (path, i, x)
3444 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445       dump_printf (MSG_NOTE, "\n");
3446     }
3447 
3448   /* Check whether the reduction path detected is valid.  */
3449   bool fail = path.length () == 0;
3450   bool neg = false;
3451   int sign = -1;
3452   *code = ERROR_MARK;
3453   for (unsigned i = 1; i < path.length (); ++i)
3454     {
3455       gimple *use_stmt = USE_STMT (path[i].second);
3456       gimple_match_op op;
3457       if (!gimple_extract_op (use_stmt, &op))
3458 	{
3459 	  fail = true;
3460 	  break;
3461 	}
3462       unsigned int opi = op.num_ops;
3463       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3464 	{
3465 	  /* The following make sure we can compute the operand index
3466 	     easily plus it mostly disallows chaining via COND_EXPR condition
3467 	     operands.  */
3468 	  for (opi = 0; opi < op.num_ops; ++opi)
3469 	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470 	      break;
3471 	}
3472       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3473 	{
3474 	  for (opi = 0; opi < op.num_ops; ++opi)
3475 	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476 	      break;
3477 	}
3478       if (opi == op.num_ops)
3479 	{
3480 	  fail = true;
3481 	  break;
3482 	}
3483       op.code = canonicalize_code (op.code, op.type);
3484       if (op.code == MINUS_EXPR)
3485 	{
3486 	  op.code = PLUS_EXPR;
3487 	  /* Track whether we negate the reduction value each iteration.  */
3488 	  if (op.ops[1] == op.ops[opi])
3489 	    neg = ! neg;
3490 	}
3491       if (CONVERT_EXPR_CODE_P (op.code)
3492 	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3493 	;
3494       else if (*code == ERROR_MARK)
3495 	{
3496 	  *code = op.code;
3497 	  sign = TYPE_SIGN (op.type);
3498 	}
3499       else if (op.code != *code)
3500 	{
3501 	  fail = true;
3502 	  break;
3503 	}
3504       else if ((op.code == MIN_EXPR
3505 		|| op.code == MAX_EXPR)
3506 	       && sign != TYPE_SIGN (op.type))
3507 	{
3508 	  fail = true;
3509 	  break;
3510 	}
3511       /* Check there's only a single stmt the op is used on.  For the
3512 	 not value-changing tail and the last stmt allow out-of-loop uses.
3513 	 ???  We could relax this and handle arbitrary live stmts by
3514 	 forcing a scalar epilogue for example.  */
3515       imm_use_iterator imm_iter;
3516       gimple *op_use_stmt;
3517       unsigned cnt = 0;
3518       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3519 	if (!is_gimple_debug (op_use_stmt)
3520 	    && (*code != ERROR_MARK
3521 		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3522 	  {
3523 	    /* We want to allow x + x but not x < 1 ? x : 2.  */
3524 	    if (is_gimple_assign (op_use_stmt)
3525 		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3526 	      {
3527 		use_operand_p use_p;
3528 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3529 		  cnt++;
3530 	      }
3531 	    else
3532 	      cnt++;
3533 	  }
3534       if (cnt != 1)
3535 	{
3536 	  fail = true;
3537 	  break;
3538 	}
3539     }
3540   return ! fail && ! neg && *code != ERROR_MARK;
3541 }
3542 
3543 bool
3544 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3545 		      tree loop_arg, enum tree_code code)
3546 {
3547   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3548   code_helper code_;
3549   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3550 	  && code_ == code);
3551 }
3552 
3553 
3554 
3555 /* Function vect_is_simple_reduction
3556 
3557    (1) Detect a cross-iteration def-use cycle that represents a simple
3558    reduction computation.  We look for the following pattern:
3559 
3560    loop_header:
3561      a1 = phi < a0, a2 >
3562      a3 = ...
3563      a2 = operation (a3, a1)
3564 
3565    or
3566 
3567    a3 = ...
3568    loop_header:
3569      a1 = phi < a0, a2 >
3570      a2 = operation (a3, a1)
3571 
3572    such that:
3573    1. operation is commutative and associative and it is safe to
3574       change the order of the computation
3575    2. no uses for a2 in the loop (a2 is used out of the loop)
3576    3. no uses of a1 in the loop besides the reduction operation
3577    4. no uses of a1 outside the loop.
3578 
3579    Conditions 1,4 are tested here.
3580    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3581 
3582    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3583    nested cycles.
3584 
3585    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3586    reductions:
3587 
3588      a1 = phi < a0, a2 >
3589      inner loop (def of a3)
3590      a2 = phi < a3 >
3591 
3592    (4) Detect condition expressions, ie:
3593      for (int i = 0; i < N; i++)
3594        if (a[i] < val)
3595 	ret_val = a[i];
3596 
3597 */
3598 
3599 static stmt_vec_info
3600 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3601 			  bool *double_reduc, bool *reduc_chain_p)
3602 {
3603   gphi *phi = as_a <gphi *> (phi_info->stmt);
3604   gimple *phi_use_stmt = NULL;
3605   imm_use_iterator imm_iter;
3606   use_operand_p use_p;
3607 
3608   *double_reduc = false;
3609   *reduc_chain_p = false;
3610   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3611 
3612   tree phi_name = PHI_RESULT (phi);
3613   /* ???  If there are no uses of the PHI result the inner loop reduction
3614      won't be detected as possibly double-reduction by vectorizable_reduction
3615      because that tries to walk the PHI arg from the preheader edge which
3616      can be constant.  See PR60382.  */
3617   if (has_zero_uses (phi_name))
3618     return NULL;
3619   class loop *loop = (gimple_bb (phi))->loop_father;
3620   unsigned nphi_def_loop_uses = 0;
3621   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3622     {
3623       gimple *use_stmt = USE_STMT (use_p);
3624       if (is_gimple_debug (use_stmt))
3625 	continue;
3626 
3627       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3628         {
3629           if (dump_enabled_p ())
3630 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3631 			     "intermediate value used outside loop.\n");
3632 
3633           return NULL;
3634         }
3635 
3636       nphi_def_loop_uses++;
3637       phi_use_stmt = use_stmt;
3638     }
3639 
3640   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3641   if (TREE_CODE (latch_def) != SSA_NAME)
3642     {
3643       if (dump_enabled_p ())
3644 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3645 			 "reduction: not ssa_name: %T\n", latch_def);
3646       return NULL;
3647     }
3648 
3649   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3650   if (!def_stmt_info
3651       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3652     return NULL;
3653 
3654   bool nested_in_vect_loop
3655     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3656   unsigned nlatch_def_loop_uses = 0;
3657   auto_vec<gphi *, 3> lcphis;
3658   bool inner_loop_of_double_reduc = false;
3659   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3660     {
3661       gimple *use_stmt = USE_STMT (use_p);
3662       if (is_gimple_debug (use_stmt))
3663 	continue;
3664       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3665 	nlatch_def_loop_uses++;
3666       else
3667 	{
3668 	  /* We can have more than one loop-closed PHI.  */
3669 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3670 	  if (nested_in_vect_loop
3671 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3672 		  == vect_double_reduction_def))
3673 	    inner_loop_of_double_reduc = true;
3674 	}
3675     }
3676 
3677   /* If we are vectorizing an inner reduction we are executing that
3678      in the original order only in case we are not dealing with a
3679      double reduction.  */
3680   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3681     {
3682       if (dump_enabled_p ())
3683 	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3684 			"detected nested cycle: ");
3685       return def_stmt_info;
3686     }
3687 
3688   /* When the inner loop of a double reduction ends up with more than
3689      one loop-closed PHI we have failed to classify alternate such
3690      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3691   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3692     {
3693       if (dump_enabled_p ())
3694 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3695 			 "unhandle double reduction\n");
3696       return NULL;
3697     }
3698 
3699   /* If this isn't a nested cycle or if the nested cycle reduction value
3700      is used ouside of the inner loop we cannot handle uses of the reduction
3701      value.  */
3702   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3703     {
3704       if (dump_enabled_p ())
3705 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 			 "reduction used in loop.\n");
3707       return NULL;
3708     }
3709 
3710   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3711      defined in the inner loop.  */
3712   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3713     {
3714       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3715       if (gimple_phi_num_args (def_stmt) != 1
3716           || TREE_CODE (op1) != SSA_NAME)
3717         {
3718           if (dump_enabled_p ())
3719 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3720 			     "unsupported phi node definition.\n");
3721 
3722           return NULL;
3723         }
3724 
3725       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3726       if (gimple_bb (def1)
3727 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3728 	  && loop->inner
3729 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3730 	  && (is_gimple_assign (def1) || is_gimple_call (def1))
3731 	  && is_a <gphi *> (phi_use_stmt)
3732 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3733         {
3734           if (dump_enabled_p ())
3735             report_vect_op (MSG_NOTE, def_stmt,
3736 			    "detected double reduction: ");
3737 
3738           *double_reduc = true;
3739 	  return def_stmt_info;
3740         }
3741 
3742       return NULL;
3743     }
3744 
3745   /* Look for the expression computing latch_def from then loop PHI result.  */
3746   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3747   code_helper code;
3748   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3749 			    path))
3750     {
3751       STMT_VINFO_REDUC_CODE (phi_info) = code;
3752       if (code == COND_EXPR && !nested_in_vect_loop)
3753 	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3754 
3755       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3756 	 reduction chain for which the additional restriction is that
3757 	 all operations in the chain are the same.  */
3758       auto_vec<stmt_vec_info, 8> reduc_chain;
3759       unsigned i;
3760       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3761       for (i = path.length () - 1; i >= 1; --i)
3762 	{
3763 	  gimple *stmt = USE_STMT (path[i].second);
3764 	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3765 	  gimple_match_op op;
3766 	  if (!gimple_extract_op (stmt, &op))
3767 	    gcc_unreachable ();
3768 	  if (gassign *assign = dyn_cast<gassign *> (stmt))
3769 	    STMT_VINFO_REDUC_IDX (stmt_info)
3770 	      = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3771 	  else
3772 	    {
3773 	      gcall *call = as_a<gcall *> (stmt);
3774 	      STMT_VINFO_REDUC_IDX (stmt_info)
3775 		= path[i].second->use - gimple_call_arg_ptr (call, 0);
3776 	    }
3777 	  bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3778 				     && (i == 1 || i == path.length () - 1));
3779 	  if ((op.code != code && !leading_conversion)
3780 	      /* We can only handle the final value in epilogue
3781 		 generation for reduction chains.  */
3782 	      || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3783 	    is_slp_reduc = false;
3784 	  /* For reduction chains we support a trailing/leading
3785 	     conversions.  We do not store those in the actual chain.  */
3786 	  if (leading_conversion)
3787 	    continue;
3788 	  reduc_chain.safe_push (stmt_info);
3789 	}
3790       if (is_slp_reduc && reduc_chain.length () > 1)
3791 	{
3792 	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3793 	    {
3794 	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3795 	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3796 	    }
3797 	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3798 	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3799 
3800 	  /* Save the chain for further analysis in SLP detection.  */
3801 	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3802 	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3803 
3804 	  *reduc_chain_p = true;
3805 	  if (dump_enabled_p ())
3806 	    dump_printf_loc (MSG_NOTE, vect_location,
3807 			    "reduction: detected reduction chain\n");
3808 	}
3809       else if (dump_enabled_p ())
3810 	dump_printf_loc (MSG_NOTE, vect_location,
3811 			 "reduction: detected reduction\n");
3812 
3813       return def_stmt_info;
3814     }
3815 
3816   if (dump_enabled_p ())
3817     dump_printf_loc (MSG_NOTE, vect_location,
3818 		     "reduction: unknown pattern\n");
3819 
3820   return NULL;
3821 }
3822 
3823 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3824    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3825    or -1 if not known.  */
3826 
3827 static int
3828 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3829 {
3830   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3831   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3832     {
3833       if (dump_enabled_p ())
3834 	dump_printf_loc (MSG_NOTE, vect_location,
3835 			 "cost model: epilogue peel iters set to vf/2 "
3836 			 "because loop iterations are unknown .\n");
3837       return assumed_vf / 2;
3838     }
3839   else
3840     {
3841       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3842       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3843       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3844       /* If we need to peel for gaps, but no peeling is required, we have to
3845 	 peel VF iterations.  */
3846       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3847 	peel_iters_epilogue = assumed_vf;
3848       return peel_iters_epilogue;
3849     }
3850 }
3851 
3852 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3853 int
3854 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3855 			     int *peel_iters_epilogue,
3856 			     stmt_vector_for_cost *scalar_cost_vec,
3857 			     stmt_vector_for_cost *prologue_cost_vec,
3858 			     stmt_vector_for_cost *epilogue_cost_vec)
3859 {
3860   int retval = 0;
3861 
3862   *peel_iters_epilogue
3863     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3864 
3865   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3866     {
3867       /* If peeled iterations are known but number of scalar loop
3868 	 iterations are unknown, count a taken branch per peeled loop.  */
3869       if (peel_iters_prologue > 0)
3870 	retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3871 				   vect_prologue);
3872       if (*peel_iters_epilogue > 0)
3873 	retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3874 				    vect_epilogue);
3875     }
3876 
3877   stmt_info_for_cost *si;
3878   int j;
3879   if (peel_iters_prologue)
3880     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3881       retval += record_stmt_cost (prologue_cost_vec,
3882 				  si->count * peel_iters_prologue,
3883 				  si->kind, si->stmt_info, si->misalign,
3884 				  vect_prologue);
3885   if (*peel_iters_epilogue)
3886     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3887       retval += record_stmt_cost (epilogue_cost_vec,
3888 				  si->count * *peel_iters_epilogue,
3889 				  si->kind, si->stmt_info, si->misalign,
3890 				  vect_epilogue);
3891 
3892   return retval;
3893 }
3894 
3895 /* Function vect_estimate_min_profitable_iters
3896 
3897    Return the number of iterations required for the vector version of the
3898    loop to be profitable relative to the cost of the scalar version of the
3899    loop.
3900 
3901    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3902    of iterations for vectorization.  -1 value means loop vectorization
3903    is not profitable.  This returned value may be used for dynamic
3904    profitability check.
3905 
3906    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3907    for static check against estimated number of iterations.  */
3908 
3909 static void
3910 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3911 				    int *ret_min_profitable_niters,
3912 				    int *ret_min_profitable_estimate,
3913 				    unsigned *suggested_unroll_factor)
3914 {
3915   int min_profitable_iters;
3916   int min_profitable_estimate;
3917   int peel_iters_prologue;
3918   int peel_iters_epilogue;
3919   unsigned vec_inside_cost = 0;
3920   int vec_outside_cost = 0;
3921   unsigned vec_prologue_cost = 0;
3922   unsigned vec_epilogue_cost = 0;
3923   int scalar_single_iter_cost = 0;
3924   int scalar_outside_cost = 0;
3925   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3926   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3927   vector_costs *target_cost_data = loop_vinfo->vector_costs;
3928 
3929   /* Cost model disabled.  */
3930   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3931     {
3932       if (dump_enabled_p ())
3933 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3934       *ret_min_profitable_niters = 0;
3935       *ret_min_profitable_estimate = 0;
3936       return;
3937     }
3938 
3939   /* Requires loop versioning tests to handle misalignment.  */
3940   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3941     {
3942       /*  FIXME: Make cost depend on complexity of individual check.  */
3943       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3944       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3945       if (dump_enabled_p ())
3946 	dump_printf (MSG_NOTE,
3947 		     "cost model: Adding cost of checks for loop "
3948 		     "versioning to treat misalignment.\n");
3949     }
3950 
3951   /* Requires loop versioning with alias checks.  */
3952   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3953     {
3954       /*  FIXME: Make cost depend on complexity of individual check.  */
3955       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3956       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3957       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3958       if (len)
3959 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3960 	(void) add_stmt_cost (target_cost_data, len * 2 - 1,
3961 			      scalar_stmt, vect_prologue);
3962       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3963       if (len)
3964 	{
3965 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3966 	  unsigned int nstmts = len * 2 - 1;
3967 	  /* +1 for each bias that needs adding.  */
3968 	  for (unsigned int i = 0; i < len; ++i)
3969 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3970 	      nstmts += 1;
3971 	  (void) add_stmt_cost (target_cost_data, nstmts,
3972 				scalar_stmt, vect_prologue);
3973 	}
3974       if (dump_enabled_p ())
3975 	dump_printf (MSG_NOTE,
3976 		     "cost model: Adding cost of checks for loop "
3977 		     "versioning aliasing.\n");
3978     }
3979 
3980   /* Requires loop versioning with niter checks.  */
3981   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3982     {
3983       /*  FIXME: Make cost depend on complexity of individual check.  */
3984       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3985 			    NULL, NULL, NULL_TREE, 0, vect_prologue);
3986       if (dump_enabled_p ())
3987 	dump_printf (MSG_NOTE,
3988 		     "cost model: Adding cost of checks for loop "
3989 		     "versioning niters.\n");
3990     }
3991 
3992   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3993     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3994 			  vect_prologue);
3995 
3996   /* Count statements in scalar loop.  Using this as scalar cost for a single
3997      iteration for now.
3998 
3999      TODO: Add outer loop support.
4000 
4001      TODO: Consider assigning different costs to different scalar
4002      statements.  */
4003 
4004   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4005 
4006   /* Add additional cost for the peeled instructions in prologue and epilogue
4007      loop.  (For fully-masked loops there will be no peeling.)
4008 
4009      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4010      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4011 
4012      TODO: Build an expression that represents peel_iters for prologue and
4013      epilogue to be used in a run-time test.  */
4014 
4015   bool prologue_need_br_taken_cost = false;
4016   bool prologue_need_br_not_taken_cost = false;
4017 
4018   /* Calculate peel_iters_prologue.  */
4019   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4020     peel_iters_prologue = 0;
4021   else if (npeel < 0)
4022     {
4023       peel_iters_prologue = assumed_vf / 2;
4024       if (dump_enabled_p ())
4025 	dump_printf (MSG_NOTE, "cost model: "
4026 		     "prologue peel iters set to vf/2.\n");
4027 
4028       /* If peeled iterations are unknown, count a taken branch and a not taken
4029 	 branch per peeled loop.  Even if scalar loop iterations are known,
4030 	 vector iterations are not known since peeled prologue iterations are
4031 	 not known.  Hence guards remain the same.  */
4032       prologue_need_br_taken_cost = true;
4033       prologue_need_br_not_taken_cost = true;
4034     }
4035   else
4036     {
4037       peel_iters_prologue = npeel;
4038       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4039 	/* If peeled iterations are known but number of scalar loop
4040 	   iterations are unknown, count a taken branch per peeled loop.  */
4041 	prologue_need_br_taken_cost = true;
4042     }
4043 
4044   bool epilogue_need_br_taken_cost = false;
4045   bool epilogue_need_br_not_taken_cost = false;
4046 
4047   /* Calculate peel_iters_epilogue.  */
4048   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4049     /* We need to peel exactly one iteration for gaps.  */
4050     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4051   else if (npeel < 0)
4052     {
4053       /* If peeling for alignment is unknown, loop bound of main loop
4054 	 becomes unknown.  */
4055       peel_iters_epilogue = assumed_vf / 2;
4056       if (dump_enabled_p ())
4057 	dump_printf (MSG_NOTE, "cost model: "
4058 		     "epilogue peel iters set to vf/2 because "
4059 		     "peeling for alignment is unknown.\n");
4060 
4061       /* See the same reason above in peel_iters_prologue calculation.  */
4062       epilogue_need_br_taken_cost = true;
4063       epilogue_need_br_not_taken_cost = true;
4064     }
4065   else
4066     {
4067       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4068       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4069 	/* If peeled iterations are known but number of scalar loop
4070 	   iterations are unknown, count a taken branch per peeled loop.  */
4071 	epilogue_need_br_taken_cost = true;
4072     }
4073 
4074   stmt_info_for_cost *si;
4075   int j;
4076   /* Add costs associated with peel_iters_prologue.  */
4077   if (peel_iters_prologue)
4078     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4079       {
4080 	(void) add_stmt_cost (target_cost_data,
4081 			      si->count * peel_iters_prologue, si->kind,
4082 			      si->stmt_info, si->node, si->vectype,
4083 			      si->misalign, vect_prologue);
4084       }
4085 
4086   /* Add costs associated with peel_iters_epilogue.  */
4087   if (peel_iters_epilogue)
4088     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4089       {
4090 	(void) add_stmt_cost (target_cost_data,
4091 			      si->count * peel_iters_epilogue, si->kind,
4092 			      si->stmt_info, si->node, si->vectype,
4093 			      si->misalign, vect_epilogue);
4094       }
4095 
4096   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4097 
4098   if (prologue_need_br_taken_cost)
4099     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4100 			  vect_prologue);
4101 
4102   if (prologue_need_br_not_taken_cost)
4103     (void) add_stmt_cost (target_cost_data, 1,
4104 			  cond_branch_not_taken, vect_prologue);
4105 
4106   if (epilogue_need_br_taken_cost)
4107     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4108 			  vect_epilogue);
4109 
4110   if (epilogue_need_br_not_taken_cost)
4111     (void) add_stmt_cost (target_cost_data, 1,
4112 			  cond_branch_not_taken, vect_epilogue);
4113 
4114   /* Take care of special costs for rgroup controls of partial vectors.  */
4115   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4116     {
4117       /* Calculate how many masks we need to generate.  */
4118       unsigned int num_masks = 0;
4119       rgroup_controls *rgm;
4120       unsigned int num_vectors_m1;
4121       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4122 	if (rgm->type)
4123 	  num_masks += num_vectors_m1 + 1;
4124       gcc_assert (num_masks > 0);
4125 
4126       /* In the worst case, we need to generate each mask in the prologue
4127 	 and in the loop body.  One of the loop body mask instructions
4128 	 replaces the comparison in the scalar loop, and since we don't
4129 	 count the scalar comparison against the scalar body, we shouldn't
4130 	 count that vector instruction against the vector body either.
4131 
4132 	 Sometimes we can use unpacks instead of generating prologue
4133 	 masks and sometimes the prologue mask will fold to a constant,
4134 	 so the actual prologue cost might be smaller.  However, it's
4135 	 simpler and safer to use the worst-case cost; if this ends up
4136 	 being the tie-breaker between vectorizing or not, then it's
4137 	 probably better not to vectorize.  */
4138       (void) add_stmt_cost (target_cost_data, num_masks,
4139 			    vector_stmt, NULL, NULL, NULL_TREE, 0,
4140 			    vect_prologue);
4141       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4142 			    vector_stmt, NULL, NULL, NULL_TREE, 0,
4143 			    vect_body);
4144     }
4145   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4146     {
4147       /* Referring to the functions vect_set_loop_condition_partial_vectors
4148 	 and vect_set_loop_controls_directly, we need to generate each
4149 	 length in the prologue and in the loop body if required. Although
4150 	 there are some possible optimizations, we consider the worst case
4151 	 here.  */
4152 
4153       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4154       signed char partial_load_store_bias
4155 	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4156       bool need_iterate_p
4157 	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4158 	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
4159 
4160       /* Calculate how many statements to be added.  */
4161       unsigned int prologue_stmts = 0;
4162       unsigned int body_stmts = 0;
4163 
4164       rgroup_controls *rgc;
4165       unsigned int num_vectors_m1;
4166       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4167 	if (rgc->type)
4168 	  {
4169 	    /* May need one SHIFT for nitems_total computation.  */
4170 	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4171 	    if (nitems != 1 && !niters_known_p)
4172 	      prologue_stmts += 1;
4173 
4174 	    /* May need one MAX and one MINUS for wrap around.  */
4175 	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4176 	      prologue_stmts += 2;
4177 
4178 	    /* Need one MAX and one MINUS for each batch limit excepting for
4179 	       the 1st one.  */
4180 	    prologue_stmts += num_vectors_m1 * 2;
4181 
4182 	    unsigned int num_vectors = num_vectors_m1 + 1;
4183 
4184 	    /* Need to set up lengths in prologue, only one MIN required
4185 	       for each since start index is zero.  */
4186 	    prologue_stmts += num_vectors;
4187 
4188 	    /* If we have a non-zero partial load bias, we need one PLUS
4189 	       to adjust the load length.  */
4190 	    if (partial_load_store_bias != 0)
4191 	      body_stmts += 1;
4192 
4193 	    /* Each may need two MINs and one MINUS to update lengths in body
4194 	       for next iteration.  */
4195 	    if (need_iterate_p)
4196 	      body_stmts += 3 * num_vectors;
4197 	  }
4198 
4199       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4200 			    scalar_stmt, vect_prologue);
4201       (void) add_stmt_cost (target_cost_data, body_stmts,
4202 			    scalar_stmt, vect_body);
4203     }
4204 
4205   /* FORNOW: The scalar outside cost is incremented in one of the
4206      following ways:
4207 
4208      1. The vectorizer checks for alignment and aliasing and generates
4209      a condition that allows dynamic vectorization.  A cost model
4210      check is ANDED with the versioning condition.  Hence scalar code
4211      path now has the added cost of the versioning check.
4212 
4213        if (cost > th & versioning_check)
4214          jmp to vector code
4215 
4216      Hence run-time scalar is incremented by not-taken branch cost.
4217 
4218      2. The vectorizer then checks if a prologue is required.  If the
4219      cost model check was not done before during versioning, it has to
4220      be done before the prologue check.
4221 
4222        if (cost <= th)
4223          prologue = scalar_iters
4224        if (prologue == 0)
4225          jmp to vector code
4226        else
4227          execute prologue
4228        if (prologue == num_iters)
4229 	 go to exit
4230 
4231      Hence the run-time scalar cost is incremented by a taken branch,
4232      plus a not-taken branch, plus a taken branch cost.
4233 
4234      3. The vectorizer then checks if an epilogue is required.  If the
4235      cost model check was not done before during prologue check, it
4236      has to be done with the epilogue check.
4237 
4238        if (prologue == 0)
4239          jmp to vector code
4240        else
4241          execute prologue
4242        if (prologue == num_iters)
4243 	 go to exit
4244        vector code:
4245          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4246            jmp to epilogue
4247 
4248      Hence the run-time scalar cost should be incremented by 2 taken
4249      branches.
4250 
4251      TODO: The back end may reorder the BBS's differently and reverse
4252      conditions/branch directions.  Change the estimates below to
4253      something more reasonable.  */
4254 
4255   /* If the number of iterations is known and we do not do versioning, we can
4256      decide whether to vectorize at compile time.  Hence the scalar version
4257      do not carry cost model guard costs.  */
4258   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4259       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4260     {
4261       /* Cost model check occurs at versioning.  */
4262       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4263 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4264       else
4265 	{
4266 	  /* Cost model check occurs at prologue generation.  */
4267 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4268 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4269 	      + vect_get_stmt_cost (cond_branch_not_taken);
4270 	  /* Cost model check occurs at epilogue generation.  */
4271 	  else
4272 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4273 	}
4274     }
4275 
4276   /* Complete the target-specific cost calculations.  */
4277   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4278 	       &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4279 	       suggested_unroll_factor);
4280 
4281   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4282       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4283       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4284 		    *suggested_unroll_factor,
4285 		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4286     {
4287       if (dump_enabled_p ())
4288 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4289 			 "can't unroll as unrolled vectorization factor larger"
4290 			 " than maximum vectorization factor: "
4291 			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4292 			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4293       *suggested_unroll_factor = 1;
4294     }
4295 
4296   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4297 
4298   if (dump_enabled_p ())
4299     {
4300       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4301       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4302                    vec_inside_cost);
4303       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4304                    vec_prologue_cost);
4305       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4306                    vec_epilogue_cost);
4307       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4308                    scalar_single_iter_cost);
4309       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4310                    scalar_outside_cost);
4311       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4312                    vec_outside_cost);
4313       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4314                    peel_iters_prologue);
4315       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4316                    peel_iters_epilogue);
4317     }
4318 
4319   /* Calculate number of iterations required to make the vector version
4320      profitable, relative to the loop bodies only.  The following condition
4321      must hold true:
4322      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4323      where
4324      SIC = scalar iteration cost, VIC = vector iteration cost,
4325      VOC = vector outside cost, VF = vectorization factor,
4326      NPEEL = prologue iterations + epilogue iterations,
4327      SOC = scalar outside cost for run time cost model check.  */
4328 
4329   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4330 			  - vec_inside_cost);
4331   if (saving_per_viter <= 0)
4332     {
4333       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4334 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4335 		    "vectorization did not happen for a simd loop");
4336 
4337       if (dump_enabled_p ())
4338         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4339 			 "cost model: the vector iteration cost = %d "
4340 			 "divided by the scalar iteration cost = %d "
4341 			 "is greater or equal to the vectorization factor = %d"
4342                          ".\n",
4343 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4344       *ret_min_profitable_niters = -1;
4345       *ret_min_profitable_estimate = -1;
4346       return;
4347     }
4348 
4349   /* ??? The "if" arm is written to handle all cases; see below for what
4350      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4351   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4352     {
4353       /* Rewriting the condition above in terms of the number of
4354 	 vector iterations (vniters) rather than the number of
4355 	 scalar iterations (niters) gives:
4356 
4357 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4358 
4359 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4360 
4361 	 For integer N, X and Y when X > 0:
4362 
4363 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4364       int outside_overhead = (vec_outside_cost
4365 			      - scalar_single_iter_cost * peel_iters_prologue
4366 			      - scalar_single_iter_cost * peel_iters_epilogue
4367 			      - scalar_outside_cost);
4368       /* We're only interested in cases that require at least one
4369 	 vector iteration.  */
4370       int min_vec_niters = 1;
4371       if (outside_overhead > 0)
4372 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4373 
4374       if (dump_enabled_p ())
4375 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4376 		     min_vec_niters);
4377 
4378       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4379 	{
4380 	  /* Now that we know the minimum number of vector iterations,
4381 	     find the minimum niters for which the scalar cost is larger:
4382 
4383 	     SIC * niters > VIC * vniters + VOC - SOC
4384 
4385 	     We know that the minimum niters is no more than
4386 	     vniters * VF + NPEEL, but it might be (and often is) less
4387 	     than that if a partial vector iteration is cheaper than the
4388 	     equivalent scalar code.  */
4389 	  int threshold = (vec_inside_cost * min_vec_niters
4390 			   + vec_outside_cost
4391 			   - scalar_outside_cost);
4392 	  if (threshold <= 0)
4393 	    min_profitable_iters = 1;
4394 	  else
4395 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4396 	}
4397       else
4398 	/* Convert the number of vector iterations into a number of
4399 	   scalar iterations.  */
4400 	min_profitable_iters = (min_vec_niters * assumed_vf
4401 				+ peel_iters_prologue
4402 				+ peel_iters_epilogue);
4403     }
4404   else
4405     {
4406       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4407 			      * assumed_vf
4408 			      - vec_inside_cost * peel_iters_prologue
4409 			      - vec_inside_cost * peel_iters_epilogue);
4410       if (min_profitable_iters <= 0)
4411         min_profitable_iters = 0;
4412       else
4413 	{
4414 	  min_profitable_iters /= saving_per_viter;
4415 
4416 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4417 	      <= (((int) vec_inside_cost * min_profitable_iters)
4418 		  + (((int) vec_outside_cost - scalar_outside_cost)
4419 		     * assumed_vf)))
4420 	    min_profitable_iters++;
4421 	}
4422     }
4423 
4424   if (dump_enabled_p ())
4425     dump_printf (MSG_NOTE,
4426 		 "  Calculated minimum iters for profitability: %d\n",
4427 		 min_profitable_iters);
4428 
4429   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4430       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4431     /* We want the vectorized loop to execute at least once.  */
4432     min_profitable_iters = assumed_vf + peel_iters_prologue;
4433   else if (min_profitable_iters < peel_iters_prologue)
4434     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4435        vectorized loop executes at least once.  */
4436     min_profitable_iters = peel_iters_prologue;
4437 
4438   if (dump_enabled_p ())
4439     dump_printf_loc (MSG_NOTE, vect_location,
4440                      "  Runtime profitability threshold = %d\n",
4441                      min_profitable_iters);
4442 
4443   *ret_min_profitable_niters = min_profitable_iters;
4444 
4445   /* Calculate number of iterations required to make the vector version
4446      profitable, relative to the loop bodies only.
4447 
4448      Non-vectorized variant is SIC * niters and it must win over vector
4449      variant on the expected loop trip count.  The following condition must hold true:
4450      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4451 
4452   if (vec_outside_cost <= 0)
4453     min_profitable_estimate = 0;
4454   /* ??? This "else if" arm is written to handle all cases; see below for
4455      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4456   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4457     {
4458       /* This is a repeat of the code above, but with + SOC rather
4459 	 than - SOC.  */
4460       int outside_overhead = (vec_outside_cost
4461 			      - scalar_single_iter_cost * peel_iters_prologue
4462 			      - scalar_single_iter_cost * peel_iters_epilogue
4463 			      + scalar_outside_cost);
4464       int min_vec_niters = 1;
4465       if (outside_overhead > 0)
4466 	min_vec_niters = outside_overhead / saving_per_viter + 1;
4467 
4468       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4469 	{
4470 	  int threshold = (vec_inside_cost * min_vec_niters
4471 			   + vec_outside_cost
4472 			   + scalar_outside_cost);
4473 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4474 	}
4475       else
4476 	min_profitable_estimate = (min_vec_niters * assumed_vf
4477 				   + peel_iters_prologue
4478 				   + peel_iters_epilogue);
4479     }
4480   else
4481     {
4482       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4483 				 * assumed_vf
4484 				 - vec_inside_cost * peel_iters_prologue
4485 				 - vec_inside_cost * peel_iters_epilogue)
4486 				 / ((scalar_single_iter_cost * assumed_vf)
4487 				   - vec_inside_cost);
4488     }
4489   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4490   if (dump_enabled_p ())
4491     dump_printf_loc (MSG_NOTE, vect_location,
4492 		     "  Static estimate profitability threshold = %d\n",
4493 		     min_profitable_estimate);
4494 
4495   *ret_min_profitable_estimate = min_profitable_estimate;
4496 }
4497 
4498 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4499    vector elements (not bits) for a vector with NELT elements.  */
4500 static void
4501 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4502 			      vec_perm_builder *sel)
4503 {
4504   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4505      by vec_perm_indices.  */
4506   sel->new_vector (nelt, 1, 3);
4507   for (unsigned int i = 0; i < 3; i++)
4508     sel->quick_push (i + offset);
4509 }
4510 
4511 /* Checks whether the target supports whole-vector shifts for vectors of mode
4512    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4513    it supports vec_perm_const with masks for all necessary shift amounts.  */
4514 static bool
4515 have_whole_vector_shift (machine_mode mode)
4516 {
4517   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4518     return true;
4519 
4520   /* Variable-length vectors should be handled via the optab.  */
4521   unsigned int nelt;
4522   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4523     return false;
4524 
4525   vec_perm_builder sel;
4526   vec_perm_indices indices;
4527   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4528     {
4529       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4530       indices.new_vector (sel, 2, nelt);
4531       if (!can_vec_perm_const_p (mode, indices, false))
4532 	return false;
4533     }
4534   return true;
4535 }
4536 
4537 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4538    functions. Design better to avoid maintenance issues.  */
4539 
4540 /* Function vect_model_reduction_cost.
4541 
4542    Models cost for a reduction operation, including the vector ops
4543    generated within the strip-mine loop in some cases, the initial
4544    definition before the loop, and the epilogue code that must be generated.  */
4545 
4546 static void
4547 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4548 			   stmt_vec_info stmt_info, internal_fn reduc_fn,
4549 			   vect_reduction_type reduction_type,
4550 			   int ncopies, stmt_vector_for_cost *cost_vec)
4551 {
4552   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4553   tree vectype;
4554   machine_mode mode;
4555   class loop *loop = NULL;
4556 
4557   if (loop_vinfo)
4558     loop = LOOP_VINFO_LOOP (loop_vinfo);
4559 
4560   /* Condition reductions generate two reductions in the loop.  */
4561   if (reduction_type == COND_REDUCTION)
4562     ncopies *= 2;
4563 
4564   vectype = STMT_VINFO_VECTYPE (stmt_info);
4565   mode = TYPE_MODE (vectype);
4566   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4567 
4568   gimple_match_op op;
4569   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4570     gcc_unreachable ();
4571 
4572   if (reduction_type == EXTRACT_LAST_REDUCTION)
4573     /* No extra instructions are needed in the prologue.  The loop body
4574        operations are costed in vectorizable_condition.  */
4575     inside_cost = 0;
4576   else if (reduction_type == FOLD_LEFT_REDUCTION)
4577     {
4578       /* No extra instructions needed in the prologue.  */
4579       prologue_cost = 0;
4580 
4581       if (reduc_fn != IFN_LAST)
4582 	/* Count one reduction-like operation per vector.  */
4583 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4584 					stmt_info, 0, vect_body);
4585       else
4586 	{
4587 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4588 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4589 	  inside_cost = record_stmt_cost (cost_vec, nelements,
4590 					  vec_to_scalar, stmt_info, 0,
4591 					  vect_body);
4592 	  inside_cost += record_stmt_cost (cost_vec, nelements,
4593 					   scalar_stmt, stmt_info, 0,
4594 					   vect_body);
4595 	}
4596     }
4597   else
4598     {
4599       /* Add in cost for initial definition.
4600 	 For cond reduction we have four vectors: initial index, step,
4601 	 initial result of the data reduction, initial value of the index
4602 	 reduction.  */
4603       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4604       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4605 					 scalar_to_vec, stmt_info, 0,
4606 					 vect_prologue);
4607     }
4608 
4609   /* Determine cost of epilogue code.
4610 
4611      We have a reduction operator that will reduce the vector in one statement.
4612      Also requires scalar extract.  */
4613 
4614   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4615     {
4616       if (reduc_fn != IFN_LAST)
4617 	{
4618 	  if (reduction_type == COND_REDUCTION)
4619 	    {
4620 	      /* An EQ stmt and an COND_EXPR stmt.  */
4621 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4622 						 vector_stmt, stmt_info, 0,
4623 						 vect_epilogue);
4624 	      /* Reduction of the max index and a reduction of the found
4625 		 values.  */
4626 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4627 						 vec_to_scalar, stmt_info, 0,
4628 						 vect_epilogue);
4629 	      /* A broadcast of the max value.  */
4630 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4631 						 scalar_to_vec, stmt_info, 0,
4632 						 vect_epilogue);
4633 	    }
4634 	  else
4635 	    {
4636 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4637 						 stmt_info, 0, vect_epilogue);
4638 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4639 						 vec_to_scalar, stmt_info, 0,
4640 						 vect_epilogue);
4641 	    }
4642 	}
4643       else if (reduction_type == COND_REDUCTION)
4644 	{
4645 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4646 	  /* Extraction of scalar elements.  */
4647 	  epilogue_cost += record_stmt_cost (cost_vec,
4648 					     2 * estimated_nunits,
4649 					     vec_to_scalar, stmt_info, 0,
4650 					     vect_epilogue);
4651 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4652 	  epilogue_cost += record_stmt_cost (cost_vec,
4653 					     2 * estimated_nunits - 3,
4654 					     scalar_stmt, stmt_info, 0,
4655 					     vect_epilogue);
4656 	}
4657       else if (reduction_type == EXTRACT_LAST_REDUCTION
4658 	       || reduction_type == FOLD_LEFT_REDUCTION)
4659 	/* No extra instructions need in the epilogue.  */
4660 	;
4661       else
4662 	{
4663 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4664 	  tree bitsize = TYPE_SIZE (op.type);
4665 	  int element_bitsize = tree_to_uhwi (bitsize);
4666 	  int nelements = vec_size_in_bits / element_bitsize;
4667 
4668 	  if (op.code == COND_EXPR)
4669 	    op.code = MAX_EXPR;
4670 
4671 	  /* We have a whole vector shift available.  */
4672 	  if (VECTOR_MODE_P (mode)
4673 	      && directly_supported_p (op.code, vectype)
4674 	      && have_whole_vector_shift (mode))
4675 	    {
4676 	      /* Final reduction via vector shifts and the reduction operator.
4677 		 Also requires scalar extract.  */
4678 	      epilogue_cost += record_stmt_cost (cost_vec,
4679 						 exact_log2 (nelements) * 2,
4680 						 vector_stmt, stmt_info, 0,
4681 						 vect_epilogue);
4682 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4683 						 vec_to_scalar, stmt_info, 0,
4684 						 vect_epilogue);
4685 	    }
4686 	  else
4687 	    /* Use extracts and reduction op for final reduction.  For N
4688 	       elements, we have N extracts and N-1 reduction ops.  */
4689 	    epilogue_cost += record_stmt_cost (cost_vec,
4690 					       nelements + nelements - 1,
4691 					       vector_stmt, stmt_info, 0,
4692 					       vect_epilogue);
4693 	}
4694     }
4695 
4696   if (dump_enabled_p ())
4697     dump_printf (MSG_NOTE,
4698                  "vect_model_reduction_cost: inside_cost = %d, "
4699                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4700                  prologue_cost, epilogue_cost);
4701 }
4702 
4703 /* SEQ is a sequence of instructions that initialize the reduction
4704    described by REDUC_INFO.  Emit them in the appropriate place.  */
4705 
4706 static void
4707 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4708 				stmt_vec_info reduc_info, gimple *seq)
4709 {
4710   if (reduc_info->reused_accumulator)
4711     {
4712       /* When reusing an accumulator from the main loop, we only need
4713 	 initialization instructions if the main loop can be skipped.
4714 	 In that case, emit the initialization instructions at the end
4715 	 of the guard block that does the skip.  */
4716       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4717       gcc_assert (skip_edge);
4718       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4719       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4720     }
4721   else
4722     {
4723       /* The normal case: emit the initialization instructions on the
4724 	 preheader edge.  */
4725       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4726       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4727     }
4728 }
4729 
4730 /* Function get_initial_def_for_reduction
4731 
4732    Input:
4733    REDUC_INFO - the info_for_reduction
4734    INIT_VAL - the initial value of the reduction variable
4735    NEUTRAL_OP - a value that has no effect on the reduction, as per
4736 		neutral_op_for_reduction
4737 
4738    Output:
4739    Return a vector variable, initialized according to the operation that
4740 	STMT_VINFO performs. This vector will be used as the initial value
4741 	of the vector of partial results.
4742 
4743    The value we need is a vector in which element 0 has value INIT_VAL
4744    and every other element has value NEUTRAL_OP.  */
4745 
4746 static tree
4747 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4748 			       stmt_vec_info reduc_info,
4749 			       tree init_val, tree neutral_op)
4750 {
4751   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4752   tree scalar_type = TREE_TYPE (init_val);
4753   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4754   tree init_def;
4755   gimple_seq stmts = NULL;
4756 
4757   gcc_assert (vectype);
4758 
4759   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4760 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4761 
4762   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4763 	      || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4764 
4765   if (operand_equal_p (init_val, neutral_op))
4766     {
4767       /* If both elements are equal then the vector described above is
4768 	 just a splat.  */
4769       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4770       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4771     }
4772   else
4773     {
4774       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4775       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4776       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4777 	{
4778 	  /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4779 	     element 0.  */
4780 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4781 						   neutral_op);
4782 	  init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4783 				   vectype, init_def, init_val);
4784 	}
4785       else
4786 	{
4787 	  /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4788 	  tree_vector_builder elts (vectype, 1, 2);
4789 	  elts.quick_push (init_val);
4790 	  elts.quick_push (neutral_op);
4791 	  init_def = gimple_build_vector (&stmts, &elts);
4792 	}
4793     }
4794 
4795   if (stmts)
4796     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4797   return init_def;
4798 }
4799 
4800 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4801    which performs a reduction involving GROUP_SIZE scalar statements.
4802    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4803    is nonnull, introducing extra elements of that value will not change the
4804    result.  */
4805 
4806 static void
4807 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4808 				stmt_vec_info reduc_info,
4809 				vec<tree> *vec_oprnds,
4810 				unsigned int number_of_vectors,
4811 				unsigned int group_size, tree neutral_op)
4812 {
4813   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4814   unsigned HOST_WIDE_INT nunits;
4815   unsigned j, number_of_places_left_in_vector;
4816   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4817   unsigned int i;
4818 
4819   gcc_assert (group_size == initial_values.length () || neutral_op);
4820 
4821   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4822      created vectors. It is greater than 1 if unrolling is performed.
4823 
4824      For example, we have two scalar operands, s1 and s2 (e.g., group of
4825      strided accesses of size two), while NUNITS is four (i.e., four scalars
4826      of this type can be packed in a vector).  The output vector will contain
4827      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4828      will be 2).
4829 
4830      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4831      vectors containing the operands.
4832 
4833      For example, NUNITS is four as before, and the group size is 8
4834      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4835      {s5, s6, s7, s8}.  */
4836 
4837   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4838     nunits = group_size;
4839 
4840   number_of_places_left_in_vector = nunits;
4841   bool constant_p = true;
4842   tree_vector_builder elts (vector_type, nunits, 1);
4843   elts.quick_grow (nunits);
4844   gimple_seq ctor_seq = NULL;
4845   for (j = 0; j < nunits * number_of_vectors; ++j)
4846     {
4847       tree op;
4848       i = j % group_size;
4849 
4850       /* Get the def before the loop.  In reduction chain we have only
4851 	 one initial value.  Else we have as many as PHIs in the group.  */
4852       if (i >= initial_values.length () || (j > i && neutral_op))
4853 	op = neutral_op;
4854       else
4855 	op = initial_values[i];
4856 
4857       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4858       number_of_places_left_in_vector--;
4859       elts[nunits - number_of_places_left_in_vector - 1] = op;
4860       if (!CONSTANT_CLASS_P (op))
4861 	constant_p = false;
4862 
4863       if (number_of_places_left_in_vector == 0)
4864 	{
4865 	  tree init;
4866 	  if (constant_p && !neutral_op
4867 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4868 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4869 	    /* Build the vector directly from ELTS.  */
4870 	    init = gimple_build_vector (&ctor_seq, &elts);
4871 	  else if (neutral_op)
4872 	    {
4873 	      /* Build a vector of the neutral value and shift the
4874 		 other elements into place.  */
4875 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4876 						   neutral_op);
4877 	      int k = nunits;
4878 	      while (k > 0 && elts[k - 1] == neutral_op)
4879 		k -= 1;
4880 	      while (k > 0)
4881 		{
4882 		  k -= 1;
4883 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4884 				       vector_type, init, elts[k]);
4885 		}
4886 	    }
4887 	  else
4888 	    {
4889 	      /* First time round, duplicate ELTS to fill the
4890 		 required number of vectors.  */
4891 	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4892 					elts, number_of_vectors, *vec_oprnds);
4893 	      break;
4894 	    }
4895 	  vec_oprnds->quick_push (init);
4896 
4897 	  number_of_places_left_in_vector = nunits;
4898 	  elts.new_vector (vector_type, nunits, 1);
4899 	  elts.quick_grow (nunits);
4900 	  constant_p = true;
4901 	}
4902     }
4903   if (ctor_seq != NULL)
4904     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4905 }
4906 
4907 /* For a statement STMT_INFO taking part in a reduction operation return
4908    the stmt_vec_info the meta information is stored on.  */
4909 
4910 stmt_vec_info
4911 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4912 {
4913   stmt_info = vect_orig_stmt (stmt_info);
4914   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4915   if (!is_a <gphi *> (stmt_info->stmt)
4916       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4917     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4918   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4919   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4920     {
4921       if (gimple_phi_num_args (phi) == 1)
4922 	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4923     }
4924   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4925     {
4926       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4927       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4928 	stmt_info = info;
4929     }
4930   return stmt_info;
4931 }
4932 
4933 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4934    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4935    return false.  */
4936 
4937 static bool
4938 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4939 				stmt_vec_info reduc_info)
4940 {
4941   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4942   if (!main_loop_vinfo)
4943     return false;
4944 
4945   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4946     return false;
4947 
4948   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4949   auto_vec<tree, 16> main_loop_results (num_phis);
4950   auto_vec<tree, 16> initial_values (num_phis);
4951   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4952     {
4953       /* The epilogue loop can be entered either from the main loop or
4954 	 from an earlier guard block.  */
4955       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4956       for (tree incoming_value : reduc_info->reduc_initial_values)
4957 	{
4958 	  /* Look for:
4959 
4960 	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4961 				    INITIAL_VALUE(guard block)>.  */
4962 	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4963 
4964 	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4965 	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4966 
4967 	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4968 	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4969 
4970 	  main_loop_results.quick_push (from_main_loop);
4971 	  initial_values.quick_push (from_skip);
4972 	}
4973     }
4974   else
4975     /* The main loop dominates the epilogue loop.  */
4976     main_loop_results.splice (reduc_info->reduc_initial_values);
4977 
4978   /* See if the main loop has the kind of accumulator we need.  */
4979   vect_reusable_accumulator *accumulator
4980     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4981   if (!accumulator
4982       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4983       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4984 		      accumulator->reduc_info->reduc_scalar_results.begin ()))
4985     return false;
4986 
4987   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4988   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4989   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4990   unsigned HOST_WIDE_INT m;
4991   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4992 			    TYPE_VECTOR_SUBPARTS (vectype), &m))
4993     return false;
4994   /* Check the intermediate vector types and operations are available.  */
4995   tree prev_vectype = old_vectype;
4996   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4997   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4998     {
4999       intermediate_nunits = exact_div (intermediate_nunits, 2);
5000       tree intermediate_vectype = get_related_vectype_for_scalar_type
5001 	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5002       if (!intermediate_vectype
5003 	  || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5004 				    intermediate_vectype)
5005 	  || !can_vec_extract (TYPE_MODE (prev_vectype),
5006 			       TYPE_MODE (intermediate_vectype)))
5007 	return false;
5008       prev_vectype = intermediate_vectype;
5009     }
5010 
5011   /* Non-SLP reductions might apply an adjustment after the reduction
5012      operation, in order to simplify the initialization of the accumulator.
5013      If the epilogue loop carries on from where the main loop left off,
5014      it should apply the same adjustment to the final reduction result.
5015 
5016      If the epilogue loop can also be entered directly (rather than via
5017      the main loop), we need to be able to handle that case in the same way,
5018      with the same adjustment.  (In principle we could add a PHI node
5019      to select the correct adjustment, but in practice that shouldn't be
5020      necessary.)  */
5021   tree main_adjustment
5022     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5023   if (loop_vinfo->main_loop_edge && main_adjustment)
5024     {
5025       gcc_assert (num_phis == 1);
5026       tree initial_value = initial_values[0];
5027       /* Check that we can use INITIAL_VALUE as the adjustment and
5028 	 initialize the accumulator with a neutral value instead.  */
5029       if (!operand_equal_p (initial_value, main_adjustment))
5030 	return false;
5031       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5032       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5033 						    code, initial_value);
5034     }
5035   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5036   reduc_info->reduc_initial_values.truncate (0);
5037   reduc_info->reduc_initial_values.splice (initial_values);
5038   reduc_info->reused_accumulator = accumulator;
5039   return true;
5040 }
5041 
5042 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5043    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5044 
5045 static tree
5046 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5047 			    gimple_seq *seq)
5048 {
5049   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5050   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5051   tree stype = TREE_TYPE (vectype);
5052   tree new_temp = vec_def;
5053   while (nunits > nunits1)
5054     {
5055       nunits /= 2;
5056       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5057 							   stype, nunits);
5058       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5059 
5060       /* The target has to make sure we support lowpart/highpart
5061 	 extraction, either via direct vector extract or through
5062 	 an integer mode punning.  */
5063       tree dst1, dst2;
5064       gimple *epilog_stmt;
5065       if (convert_optab_handler (vec_extract_optab,
5066 				 TYPE_MODE (TREE_TYPE (new_temp)),
5067 				 TYPE_MODE (vectype1))
5068 	  != CODE_FOR_nothing)
5069 	{
5070 	  /* Extract sub-vectors directly once vec_extract becomes
5071 	     a conversion optab.  */
5072 	  dst1 = make_ssa_name (vectype1);
5073 	  epilog_stmt
5074 	      = gimple_build_assign (dst1, BIT_FIELD_REF,
5075 				     build3 (BIT_FIELD_REF, vectype1,
5076 					     new_temp, TYPE_SIZE (vectype1),
5077 					     bitsize_int (0)));
5078 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5079 	  dst2 =  make_ssa_name (vectype1);
5080 	  epilog_stmt
5081 	      = gimple_build_assign (dst2, BIT_FIELD_REF,
5082 				     build3 (BIT_FIELD_REF, vectype1,
5083 					     new_temp, TYPE_SIZE (vectype1),
5084 					     bitsize_int (bitsize)));
5085 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5086 	}
5087       else
5088 	{
5089 	  /* Extract via punning to appropriately sized integer mode
5090 	     vector.  */
5091 	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
5092 	  tree etype = build_vector_type (eltype, 2);
5093 	  gcc_assert (convert_optab_handler (vec_extract_optab,
5094 					     TYPE_MODE (etype),
5095 					     TYPE_MODE (eltype))
5096 		      != CODE_FOR_nothing);
5097 	  tree tem = make_ssa_name (etype);
5098 	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5099 					     build1 (VIEW_CONVERT_EXPR,
5100 						     etype, new_temp));
5101 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5102 	  new_temp = tem;
5103 	  tem = make_ssa_name (eltype);
5104 	  epilog_stmt
5105 	      = gimple_build_assign (tem, BIT_FIELD_REF,
5106 				     build3 (BIT_FIELD_REF, eltype,
5107 					     new_temp, TYPE_SIZE (eltype),
5108 					     bitsize_int (0)));
5109 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5110 	  dst1 = make_ssa_name (vectype1);
5111 	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5112 					     build1 (VIEW_CONVERT_EXPR,
5113 						     vectype1, tem));
5114 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5115 	  tem = make_ssa_name (eltype);
5116 	  epilog_stmt
5117 	      = gimple_build_assign (tem, BIT_FIELD_REF,
5118 				     build3 (BIT_FIELD_REF, eltype,
5119 					     new_temp, TYPE_SIZE (eltype),
5120 					     bitsize_int (bitsize)));
5121 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5122 	  dst2 =  make_ssa_name (vectype1);
5123 	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5124 					     build1 (VIEW_CONVERT_EXPR,
5125 						     vectype1, tem));
5126 	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5127 	}
5128 
5129       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5130     }
5131 
5132   return new_temp;
5133 }
5134 
5135 /* Function vect_create_epilog_for_reduction
5136 
5137    Create code at the loop-epilog to finalize the result of a reduction
5138    computation.
5139 
5140    STMT_INFO is the scalar reduction stmt that is being vectorized.
5141    SLP_NODE is an SLP node containing a group of reduction statements. The
5142      first one in this group is STMT_INFO.
5143    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5144    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5145      (counting from 0)
5146 
5147    This function:
5148    1. Completes the reduction def-use cycles.
5149    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5150       by calling the function specified by REDUC_FN if available, or by
5151       other means (whole-vector shifts or a scalar loop).
5152       The function also creates a new phi node at the loop exit to preserve
5153       loop-closed form, as illustrated below.
5154 
5155      The flow at the entry to this function:
5156 
5157         loop:
5158           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5159           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5160           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5161         loop_exit:
5162           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5163           use <s_out0>
5164           use <s_out0>
5165 
5166      The above is transformed by this function into:
5167 
5168         loop:
5169           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5170           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5171           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5172         loop_exit:
5173           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5174           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5175           v_out2 = reduce <v_out1>
5176           s_out3 = extract_field <v_out2, 0>
5177           s_out4 = adjust_result <s_out3>
5178           use <s_out4>
5179           use <s_out4>
5180 */
5181 
5182 static void
5183 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5184 				  stmt_vec_info stmt_info,
5185 				  slp_tree slp_node,
5186 				  slp_instance slp_node_instance)
5187 {
5188   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5189   gcc_assert (reduc_info->is_reduc_info);
5190   /* For double reductions we need to get at the inner loop reduction
5191      stmt which has the meta info attached.  Our stmt_info is that of the
5192      loop-closed PHI of the inner loop which we remember as
5193      def for the reduction PHI generation.  */
5194   bool double_reduc = false;
5195   stmt_vec_info rdef_info = stmt_info;
5196   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5197     {
5198       gcc_assert (!slp_node);
5199       double_reduc = true;
5200       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5201 					    (stmt_info->stmt, 0));
5202       stmt_info = vect_stmt_to_vectorize (stmt_info);
5203     }
5204   gphi *reduc_def_stmt
5205     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5206   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5207   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5208   tree vectype;
5209   machine_mode mode;
5210   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5211   basic_block exit_bb;
5212   tree scalar_dest;
5213   tree scalar_type;
5214   gimple *new_phi = NULL, *phi;
5215   gimple_stmt_iterator exit_gsi;
5216   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5217   gimple *epilog_stmt = NULL;
5218   gimple *exit_phi;
5219   tree bitsize;
5220   tree def;
5221   tree orig_name, scalar_result;
5222   imm_use_iterator imm_iter, phi_imm_iter;
5223   use_operand_p use_p, phi_use_p;
5224   gimple *use_stmt;
5225   auto_vec<tree> reduc_inputs;
5226   int j, i;
5227   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5228   unsigned int group_size = 1, k;
5229   auto_vec<gimple *> phis;
5230   /* SLP reduction without reduction chain, e.g.,
5231      # a1 = phi <a2, a0>
5232      # b1 = phi <b2, b0>
5233      a2 = operation (a1)
5234      b2 = operation (b1)  */
5235   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5236   bool direct_slp_reduc;
5237   tree induction_index = NULL_TREE;
5238 
5239   if (slp_node)
5240     group_size = SLP_TREE_LANES (slp_node);
5241 
5242   if (nested_in_vect_loop_p (loop, stmt_info))
5243     {
5244       outer_loop = loop;
5245       loop = loop->inner;
5246       gcc_assert (!slp_node && double_reduc);
5247     }
5248 
5249   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5250   gcc_assert (vectype);
5251   mode = TYPE_MODE (vectype);
5252 
5253   tree induc_val = NULL_TREE;
5254   tree adjustment_def = NULL;
5255   if (slp_node)
5256     ;
5257   else
5258     {
5259       /* Optimize: for induction condition reduction, if we can't use zero
5260          for induc_val, use initial_def.  */
5261       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5262 	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5263       else if (double_reduc)
5264 	;
5265       else
5266 	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5267     }
5268 
5269   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5270   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5271   if (slp_reduc)
5272     /* All statements produce live-out values.  */
5273     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5274   else if (slp_node)
5275     {
5276       /* The last statement in the reduction chain produces the live-out
5277 	 value.  Note SLP optimization can shuffle scalar stmts to
5278 	 optimize permutations so we have to search for the last stmt.  */
5279       for (k = 0; k < group_size; ++k)
5280 	if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5281 	  {
5282 	    single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5283 	    break;
5284 	  }
5285     }
5286 
5287   unsigned vec_num;
5288   int ncopies;
5289   if (slp_node)
5290     {
5291       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5292       ncopies = 1;
5293     }
5294   else
5295     {
5296       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5297       vec_num = 1;
5298       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5299     }
5300 
5301   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5302      which is updated with the current index of the loop for every match of
5303      the original loop's cond_expr (VEC_STMT).  This results in a vector
5304      containing the last time the condition passed for that vector lane.
5305      The first match will be a 1 to allow 0 to be used for non-matching
5306      indexes.  If there are no matches at all then the vector will be all
5307      zeroes.
5308 
5309      PR92772: This algorithm is broken for architectures that support
5310      masked vectors, but do not provide fold_extract_last.  */
5311   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5312     {
5313       auto_vec<std::pair<tree, bool>, 2> ccompares;
5314       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5315       cond_info = vect_stmt_to_vectorize (cond_info);
5316       while (cond_info != reduc_info)
5317 	{
5318 	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5319 	    {
5320 	      gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5321 	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5322 	      ccompares.safe_push
5323 		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5324 				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5325 	    }
5326 	  cond_info
5327 	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5328 						 1 + STMT_VINFO_REDUC_IDX
5329 							(cond_info)));
5330 	  cond_info = vect_stmt_to_vectorize (cond_info);
5331 	}
5332       gcc_assert (ccompares.length () != 0);
5333 
5334       tree indx_before_incr, indx_after_incr;
5335       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5336       int scalar_precision
5337 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5338       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5339       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5340 	(TYPE_MODE (vectype), cr_index_scalar_type,
5341 	 TYPE_VECTOR_SUBPARTS (vectype));
5342 
5343       /* First we create a simple vector induction variable which starts
5344 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5345 	 vector size (STEP).  */
5346 
5347       /* Create a {1,2,3,...} vector.  */
5348       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5349 
5350       /* Create a vector of the step value.  */
5351       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5352       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5353 
5354       /* Create an induction variable.  */
5355       gimple_stmt_iterator incr_gsi;
5356       bool insert_after;
5357       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5358       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5359 		 insert_after, &indx_before_incr, &indx_after_incr);
5360 
5361       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5362 	 filled with zeros (VEC_ZERO).  */
5363 
5364       /* Create a vector of 0s.  */
5365       tree zero = build_zero_cst (cr_index_scalar_type);
5366       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5367 
5368       /* Create a vector phi node.  */
5369       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5370       new_phi = create_phi_node (new_phi_tree, loop->header);
5371       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5372 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
5373 
5374       /* Now take the condition from the loops original cond_exprs
5375 	 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5376 	 every match uses values from the induction variable
5377 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5378 	 (NEW_PHI_TREE).
5379 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5380 	 the new cond_expr (INDEX_COND_EXPR).  */
5381       gimple_seq stmts = NULL;
5382       for (int i = ccompares.length () - 1; i != -1; --i)
5383 	{
5384 	  tree ccompare = ccompares[i].first;
5385 	  if (ccompares[i].second)
5386 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5387 					 cr_index_vector_type,
5388 					 ccompare,
5389 					 indx_before_incr, new_phi_tree);
5390 	  else
5391 	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5392 					 cr_index_vector_type,
5393 					 ccompare,
5394 					 new_phi_tree, indx_before_incr);
5395 	}
5396       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5397 
5398       /* Update the phi with the vec cond.  */
5399       induction_index = new_phi_tree;
5400       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5401 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
5402     }
5403 
5404   /* 2. Create epilog code.
5405         The reduction epilog code operates across the elements of the vector
5406         of partial results computed by the vectorized loop.
5407         The reduction epilog code consists of:
5408 
5409         step 1: compute the scalar result in a vector (v_out2)
5410         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5411         step 3: adjust the scalar result (s_out3) if needed.
5412 
5413         Step 1 can be accomplished using one the following three schemes:
5414           (scheme 1) using reduc_fn, if available.
5415           (scheme 2) using whole-vector shifts, if available.
5416           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5417                      combined.
5418 
5419           The overall epilog code looks like this:
5420 
5421           s_out0 = phi <s_loop>         # original EXIT_PHI
5422           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5423           v_out2 = reduce <v_out1>              # step 1
5424           s_out3 = extract_field <v_out2, 0>    # step 2
5425           s_out4 = adjust_result <s_out3>       # step 3
5426 
5427           (step 3 is optional, and steps 1 and 2 may be combined).
5428           Lastly, the uses of s_out0 are replaced by s_out4.  */
5429 
5430 
5431   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5432          v_out1 = phi <VECT_DEF>
5433          Store them in NEW_PHIS.  */
5434   if (double_reduc)
5435     loop = outer_loop;
5436   exit_bb = single_exit (loop)->dest;
5437   exit_gsi = gsi_after_labels (exit_bb);
5438   reduc_inputs.create (slp_node ? vec_num : ncopies);
5439   for (unsigned i = 0; i < vec_num; i++)
5440     {
5441       gimple_seq stmts = NULL;
5442       if (slp_node)
5443 	def = vect_get_slp_vect_def (slp_node, i);
5444       else
5445 	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5446       for (j = 0; j < ncopies; j++)
5447 	{
5448 	  tree new_def = copy_ssa_name (def);
5449 	  phi = create_phi_node (new_def, exit_bb);
5450 	  if (j)
5451 	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5452 	  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5453 	  new_def = gimple_convert (&stmts, vectype, new_def);
5454 	  reduc_inputs.quick_push (new_def);
5455 	}
5456       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5457     }
5458 
5459   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5460          (i.e. when reduc_fn is not available) and in the final adjustment
5461 	 code (if needed).  Also get the original scalar reduction variable as
5462          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5463          represents a reduction pattern), the tree-code and scalar-def are
5464          taken from the original stmt that the pattern-stmt (STMT) replaces.
5465          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5466          are taken from STMT.  */
5467 
5468   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5469   if (orig_stmt_info != stmt_info)
5470     {
5471       /* Reduction pattern  */
5472       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5473       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5474     }
5475 
5476   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5477   scalar_type = TREE_TYPE (scalar_dest);
5478   scalar_results.truncate (0);
5479   scalar_results.reserve_exact (group_size);
5480   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5481   bitsize = TYPE_SIZE (scalar_type);
5482 
5483   /* True if we should implement SLP_REDUC using native reduction operations
5484      instead of scalar operations.  */
5485   direct_slp_reduc = (reduc_fn != IFN_LAST
5486 		      && slp_reduc
5487 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5488 
5489   /* In case of reduction chain, e.g.,
5490      # a1 = phi <a3, a0>
5491      a2 = operation (a1)
5492      a3 = operation (a2),
5493 
5494      we may end up with more than one vector result.  Here we reduce them
5495      to one vector.
5496 
5497      The same is true if we couldn't use a single defuse cycle.  */
5498   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5499       || direct_slp_reduc
5500       || ncopies > 1)
5501     {
5502       gimple_seq stmts = NULL;
5503       tree single_input = reduc_inputs[0];
5504       for (k = 1; k < reduc_inputs.length (); k++)
5505 	single_input = gimple_build (&stmts, code, vectype,
5506 				     single_input, reduc_inputs[k]);
5507       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5508 
5509       reduc_inputs.truncate (0);
5510       reduc_inputs.safe_push (single_input);
5511     }
5512 
5513   tree orig_reduc_input = reduc_inputs[0];
5514 
5515   /* If this loop is an epilogue loop that can be skipped after the
5516      main loop, we can only share a reduction operation between the
5517      main loop and the epilogue if we put it at the target of the
5518      skip edge.
5519 
5520      We can still reuse accumulators if this check fails.  Doing so has
5521      the minor(?) benefit of making the epilogue loop's scalar result
5522      independent of the main loop's scalar result.  */
5523   bool unify_with_main_loop_p = false;
5524   if (reduc_info->reused_accumulator
5525       && loop_vinfo->skip_this_loop_edge
5526       && single_succ_p (exit_bb)
5527       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5528     {
5529       unify_with_main_loop_p = true;
5530 
5531       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5532       reduc_inputs[0] = make_ssa_name (vectype);
5533       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5534       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5535 		   UNKNOWN_LOCATION);
5536       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5537 		   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5538       exit_gsi = gsi_after_labels (reduc_block);
5539     }
5540 
5541   /* Shouldn't be used beyond this point.  */
5542   exit_bb = nullptr;
5543 
5544   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5545       && reduc_fn != IFN_LAST)
5546     {
5547       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5548 	 various data values where the condition matched and another vector
5549 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5550 	 need to extract the last matching index (which will be the index with
5551 	 highest value) and use this to index into the data vector.
5552 	 For the case where there were no matches, the data vector will contain
5553 	 all default values and the index vector will be all zeros.  */
5554 
5555       /* Get various versions of the type of the vector of indexes.  */
5556       tree index_vec_type = TREE_TYPE (induction_index);
5557       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5558       tree index_scalar_type = TREE_TYPE (index_vec_type);
5559       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5560 
5561       /* Get an unsigned integer version of the type of the data vector.  */
5562       int scalar_precision
5563 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5564       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5565       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5566 						vectype);
5567 
5568       /* First we need to create a vector (ZERO_VEC) of zeros and another
5569 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5570 	 can create using a MAX reduction and then expanding.
5571 	 In the case where the loop never made any matches, the max index will
5572 	 be zero.  */
5573 
5574       /* Vector of {0, 0, 0,...}.  */
5575       tree zero_vec = build_zero_cst (vectype);
5576 
5577       /* Find maximum value from the vector of found indexes.  */
5578       tree max_index = make_ssa_name (index_scalar_type);
5579       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5580 							  1, induction_index);
5581       gimple_call_set_lhs (max_index_stmt, max_index);
5582       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5583 
5584       /* Vector of {max_index, max_index, max_index,...}.  */
5585       tree max_index_vec = make_ssa_name (index_vec_type);
5586       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5587 						      max_index);
5588       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5589 							max_index_vec_rhs);
5590       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5591 
5592       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5593 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5594 	 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5595 	 otherwise.  Only one value should match, resulting in a vector
5596 	 (VEC_COND) with one data value and the rest zeros.
5597 	 In the case where the loop never made any matches, every index will
5598 	 match, resulting in a vector with all data values (which will all be
5599 	 the default value).  */
5600 
5601       /* Compare the max index vector to the vector of found indexes to find
5602 	 the position of the max value.  */
5603       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5604       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5605 						      induction_index,
5606 						      max_index_vec);
5607       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5608 
5609       /* Use the compare to choose either values from the data vector or
5610 	 zero.  */
5611       tree vec_cond = make_ssa_name (vectype);
5612       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5613 						   vec_compare,
5614 						   reduc_inputs[0],
5615 						   zero_vec);
5616       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5617 
5618       /* Finally we need to extract the data value from the vector (VEC_COND)
5619 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5620 	 reduction, but because this doesn't exist, we can use a MAX reduction
5621 	 instead.  The data value might be signed or a float so we need to cast
5622 	 it first.
5623 	 In the case where the loop never made any matches, the data values are
5624 	 all identical, and so will reduce down correctly.  */
5625 
5626       /* Make the matched data values unsigned.  */
5627       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5628       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5629 				       vec_cond);
5630       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5631 							VIEW_CONVERT_EXPR,
5632 							vec_cond_cast_rhs);
5633       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5634 
5635       /* Reduce down to a scalar value.  */
5636       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5637       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5638 							   1, vec_cond_cast);
5639       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5640       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5641 
5642       /* Convert the reduced value back to the result type and set as the
5643 	 result.  */
5644       gimple_seq stmts = NULL;
5645       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5646 			       data_reduc);
5647       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5648       scalar_results.safe_push (new_temp);
5649     }
5650   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5651 	   && reduc_fn == IFN_LAST)
5652     {
5653       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5654 	 idx = 0;
5655          idx_val = induction_index[0];
5656 	 val = data_reduc[0];
5657          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5658 	   if (induction_index[i] > idx_val)
5659 	     val = data_reduc[i], idx_val = induction_index[i];
5660 	 return val;  */
5661 
5662       tree data_eltype = TREE_TYPE (vectype);
5663       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5664       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5665       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5666       /* Enforced by vectorizable_reduction, which ensures we have target
5667 	 support before allowing a conditional reduction on variable-length
5668 	 vectors.  */
5669       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5670       tree idx_val = NULL_TREE, val = NULL_TREE;
5671       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5672 	{
5673 	  tree old_idx_val = idx_val;
5674 	  tree old_val = val;
5675 	  idx_val = make_ssa_name (idx_eltype);
5676 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5677 					     build3 (BIT_FIELD_REF, idx_eltype,
5678 						     induction_index,
5679 						     bitsize_int (el_size),
5680 						     bitsize_int (off)));
5681 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5682 	  val = make_ssa_name (data_eltype);
5683 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5684 					     build3 (BIT_FIELD_REF,
5685 						     data_eltype,
5686 						     reduc_inputs[0],
5687 						     bitsize_int (el_size),
5688 						     bitsize_int (off)));
5689 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5690 	  if (off != 0)
5691 	    {
5692 	      tree new_idx_val = idx_val;
5693 	      if (off != v_size - el_size)
5694 		{
5695 		  new_idx_val = make_ssa_name (idx_eltype);
5696 		  epilog_stmt = gimple_build_assign (new_idx_val,
5697 						     MAX_EXPR, idx_val,
5698 						     old_idx_val);
5699 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5700 		}
5701 	      tree new_val = make_ssa_name (data_eltype);
5702 	      epilog_stmt = gimple_build_assign (new_val,
5703 						 COND_EXPR,
5704 						 build2 (GT_EXPR,
5705 							 boolean_type_node,
5706 							 idx_val,
5707 							 old_idx_val),
5708 						 val, old_val);
5709 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5710 	      idx_val = new_idx_val;
5711 	      val = new_val;
5712 	    }
5713 	}
5714       /* Convert the reduced value back to the result type and set as the
5715 	 result.  */
5716       gimple_seq stmts = NULL;
5717       val = gimple_convert (&stmts, scalar_type, val);
5718       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5719       scalar_results.safe_push (val);
5720     }
5721 
5722   /* 2.3 Create the reduction code, using one of the three schemes described
5723          above. In SLP we simply need to extract all the elements from the
5724          vector (without reducing them), so we use scalar shifts.  */
5725   else if (reduc_fn != IFN_LAST && !slp_reduc)
5726     {
5727       tree tmp;
5728       tree vec_elem_type;
5729 
5730       /* Case 1:  Create:
5731          v_out2 = reduc_expr <v_out1>  */
5732 
5733       if (dump_enabled_p ())
5734         dump_printf_loc (MSG_NOTE, vect_location,
5735 			 "Reduce using direct vector reduction.\n");
5736 
5737       gimple_seq stmts = NULL;
5738       vec_elem_type = TREE_TYPE (vectype);
5739       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5740 			       vec_elem_type, reduc_inputs[0]);
5741       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5742       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5743 
5744       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5745 	  && induc_val)
5746 	{
5747 	  /* Earlier we set the initial value to be a vector if induc_val
5748 	     values.  Check the result and if it is induc_val then replace
5749 	     with the original initial value, unless induc_val is
5750 	     the same as initial_def already.  */
5751 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5752 				  induc_val);
5753 	  tree initial_def = reduc_info->reduc_initial_values[0];
5754 
5755 	  tmp = make_ssa_name (new_scalar_dest);
5756 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5757 					     initial_def, new_temp);
5758 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5759 	  new_temp = tmp;
5760 	}
5761 
5762       scalar_results.safe_push (new_temp);
5763     }
5764   else if (direct_slp_reduc)
5765     {
5766       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5767 	 with the elements for other SLP statements replaced with the
5768 	 neutral value.  We can then do a normal reduction on each vector.  */
5769 
5770       /* Enforced by vectorizable_reduction.  */
5771       gcc_assert (reduc_inputs.length () == 1);
5772       gcc_assert (pow2p_hwi (group_size));
5773 
5774       gimple_seq seq = NULL;
5775 
5776       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5777 	 and the same element size as VECTYPE.  */
5778       tree index = build_index_vector (vectype, 0, 1);
5779       tree index_type = TREE_TYPE (index);
5780       tree index_elt_type = TREE_TYPE (index_type);
5781       tree mask_type = truth_type_for (index_type);
5782 
5783       /* Create a vector that, for each element, identifies which of
5784 	 the REDUC_GROUP_SIZE results should use it.  */
5785       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5786       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5787 			    build_vector_from_val (index_type, index_mask));
5788 
5789       /* Get a neutral vector value.  This is simply a splat of the neutral
5790 	 scalar value if we have one, otherwise the initial scalar value
5791 	 is itself a neutral value.  */
5792       tree vector_identity = NULL_TREE;
5793       tree neutral_op = NULL_TREE;
5794       if (slp_node)
5795 	{
5796 	  tree initial_value = NULL_TREE;
5797 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5798 	    initial_value = reduc_info->reduc_initial_values[0];
5799 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5800 						 initial_value);
5801 	}
5802       if (neutral_op)
5803 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5804 							neutral_op);
5805       for (unsigned int i = 0; i < group_size; ++i)
5806 	{
5807 	  /* If there's no univeral neutral value, we can use the
5808 	     initial scalar value from the original PHI.  This is used
5809 	     for MIN and MAX reduction, for example.  */
5810 	  if (!neutral_op)
5811 	    {
5812 	      tree scalar_value = reduc_info->reduc_initial_values[i];
5813 	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5814 					     scalar_value);
5815 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5816 							      scalar_value);
5817 	    }
5818 
5819 	  /* Calculate the equivalent of:
5820 
5821 	     sel[j] = (index[j] == i);
5822 
5823 	     which selects the elements of REDUC_INPUTS[0] that should
5824 	     be included in the result.  */
5825 	  tree compare_val = build_int_cst (index_elt_type, i);
5826 	  compare_val = build_vector_from_val (index_type, compare_val);
5827 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5828 				   index, compare_val);
5829 
5830 	  /* Calculate the equivalent of:
5831 
5832 	     vec = seq ? reduc_inputs[0] : vector_identity;
5833 
5834 	     VEC is now suitable for a full vector reduction.  */
5835 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5836 				   sel, reduc_inputs[0], vector_identity);
5837 
5838 	  /* Do the reduction and convert it to the appropriate type.  */
5839 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5840 				      TREE_TYPE (vectype), vec);
5841 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5842 	  scalar_results.safe_push (scalar);
5843 	}
5844       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5845     }
5846   else
5847     {
5848       bool reduce_with_shift;
5849       tree vec_temp;
5850 
5851       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5852 
5853       /* See if the target wants to do the final (shift) reduction
5854 	 in a vector mode of smaller size and first reduce upper/lower
5855 	 halves against each other.  */
5856       enum machine_mode mode1 = mode;
5857       tree stype = TREE_TYPE (vectype);
5858       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5859       unsigned nunits1 = nunits;
5860       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5861 	  && reduc_inputs.length () == 1)
5862 	{
5863 	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5864 	  /* For SLP reductions we have to make sure lanes match up, but
5865 	     since we're doing individual element final reduction reducing
5866 	     vector width here is even more important.
5867 	     ???  We can also separate lanes with permutes, for the common
5868 	     case of power-of-two group-size odd/even extracts would work.  */
5869 	  if (slp_reduc && nunits != nunits1)
5870 	    {
5871 	      nunits1 = least_common_multiple (nunits1, group_size);
5872 	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5873 	    }
5874 	}
5875       if (!slp_reduc
5876 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5877 	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5878 
5879       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5880 							   stype, nunits1);
5881       reduce_with_shift = have_whole_vector_shift (mode1);
5882       if (!VECTOR_MODE_P (mode1)
5883 	  || !directly_supported_p (code, vectype1))
5884 	reduce_with_shift = false;
5885 
5886       /* First reduce the vector to the desired vector size we should
5887 	 do shift reduction on by combining upper and lower halves.  */
5888       gimple_seq stmts = NULL;
5889       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5890 					     code, &stmts);
5891       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5892       reduc_inputs[0] = new_temp;
5893 
5894       if (reduce_with_shift && !slp_reduc)
5895 	{
5896 	  int element_bitsize = tree_to_uhwi (bitsize);
5897 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5898 	     for variable-length vectors and also requires direct target support
5899 	     for loop reductions.  */
5900 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5901 	  int nelements = vec_size_in_bits / element_bitsize;
5902 	  vec_perm_builder sel;
5903 	  vec_perm_indices indices;
5904 
5905           int elt_offset;
5906 
5907           tree zero_vec = build_zero_cst (vectype1);
5908           /* Case 2: Create:
5909              for (offset = nelements/2; offset >= 1; offset/=2)
5910                 {
5911                   Create:  va' = vec_shift <va, offset>
5912                   Create:  va = vop <va, va'>
5913                 }  */
5914 
5915           tree rhs;
5916 
5917           if (dump_enabled_p ())
5918             dump_printf_loc (MSG_NOTE, vect_location,
5919 			     "Reduce using vector shifts\n");
5920 
5921 	  gimple_seq stmts = NULL;
5922 	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5923           for (elt_offset = nelements / 2;
5924                elt_offset >= 1;
5925                elt_offset /= 2)
5926             {
5927 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5928 	      indices.new_vector (sel, 2, nelements);
5929 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5930 	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5931 				       new_temp, zero_vec, mask);
5932 	      new_temp = gimple_build (&stmts, code,
5933 				       vectype1, new_name, new_temp);
5934             }
5935 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5936 
5937 	  /* 2.4  Extract the final scalar result.  Create:
5938 	     s_out3 = extract_field <v_out2, bitpos>  */
5939 
5940 	  if (dump_enabled_p ())
5941 	    dump_printf_loc (MSG_NOTE, vect_location,
5942 			     "extract scalar result\n");
5943 
5944 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5945 			bitsize, bitsize_zero_node);
5946 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5947 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5948 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5949 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5950 	  scalar_results.safe_push (new_temp);
5951         }
5952       else
5953         {
5954           /* Case 3: Create:
5955              s = extract_field <v_out2, 0>
5956              for (offset = element_size;
5957                   offset < vector_size;
5958                   offset += element_size;)
5959                {
5960                  Create:  s' = extract_field <v_out2, offset>
5961                  Create:  s = op <s, s'>  // For non SLP cases
5962                }  */
5963 
5964           if (dump_enabled_p ())
5965             dump_printf_loc (MSG_NOTE, vect_location,
5966 			     "Reduce using scalar code.\n");
5967 
5968 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5969 	  int element_bitsize = tree_to_uhwi (bitsize);
5970 	  tree compute_type = TREE_TYPE (vectype);
5971 	  gimple_seq stmts = NULL;
5972 	  FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5973             {
5974               int bit_offset;
5975 	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5976 				       vec_temp, bitsize, bitsize_zero_node);
5977 
5978               /* In SLP we don't need to apply reduction operation, so we just
5979                  collect s' values in SCALAR_RESULTS.  */
5980               if (slp_reduc)
5981                 scalar_results.safe_push (new_temp);
5982 
5983               for (bit_offset = element_bitsize;
5984                    bit_offset < vec_size_in_bits;
5985                    bit_offset += element_bitsize)
5986                 {
5987                   tree bitpos = bitsize_int (bit_offset);
5988 		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5989 					   compute_type, vec_temp,
5990 					   bitsize, bitpos);
5991                   if (slp_reduc)
5992                     {
5993                       /* In SLP we don't need to apply reduction operation, so
5994                          we just collect s' values in SCALAR_RESULTS.  */
5995                       new_temp = new_name;
5996                       scalar_results.safe_push (new_name);
5997                     }
5998                   else
5999 		    new_temp = gimple_build (&stmts, code, compute_type,
6000 					     new_name, new_temp);
6001                 }
6002             }
6003 
6004           /* The only case where we need to reduce scalar results in SLP, is
6005              unrolling.  If the size of SCALAR_RESULTS is greater than
6006              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6007              REDUC_GROUP_SIZE.  */
6008           if (slp_reduc)
6009             {
6010               tree res, first_res, new_res;
6011 
6012               /* Reduce multiple scalar results in case of SLP unrolling.  */
6013               for (j = group_size; scalar_results.iterate (j, &res);
6014                    j++)
6015                 {
6016                   first_res = scalar_results[j % group_size];
6017 		  new_res = gimple_build (&stmts, code, compute_type,
6018 					  first_res, res);
6019                   scalar_results[j % group_size] = new_res;
6020                 }
6021 	      scalar_results.truncate (group_size);
6022 	      for (k = 0; k < group_size; k++)
6023 		scalar_results[k] = gimple_convert (&stmts, scalar_type,
6024 						    scalar_results[k]);
6025             }
6026           else
6027 	    {
6028 	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6029 	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6030 	      scalar_results.safe_push (new_temp);
6031 	    }
6032 
6033 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6034         }
6035 
6036       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6037 	  && induc_val)
6038 	{
6039 	  /* Earlier we set the initial value to be a vector if induc_val
6040 	     values.  Check the result and if it is induc_val then replace
6041 	     with the original initial value, unless induc_val is
6042 	     the same as initial_def already.  */
6043 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6044 				  induc_val);
6045 	  tree initial_def = reduc_info->reduc_initial_values[0];
6046 
6047 	  tree tmp = make_ssa_name (new_scalar_dest);
6048 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6049 					     initial_def, new_temp);
6050 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6051 	  scalar_results[0] = tmp;
6052 	}
6053     }
6054 
6055   /* 2.5 Adjust the final result by the initial value of the reduction
6056 	 variable. (When such adjustment is not needed, then
6057 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
6058 	 new_temp = loop_exit_def + adjustment_def  */
6059 
6060   if (adjustment_def)
6061     {
6062       gcc_assert (!slp_reduc);
6063       gimple_seq stmts = NULL;
6064       if (double_reduc)
6065 	{
6066 	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6067 	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6068 	  new_temp = gimple_build (&stmts, code, vectype,
6069 				   reduc_inputs[0], adjustment_def);
6070 	}
6071       else
6072 	{
6073           new_temp = scalar_results[0];
6074 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6075 	  adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6076 					   adjustment_def);
6077 	  new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6078 	  new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6079 				   new_temp, adjustment_def);
6080 	  new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6081 	}
6082 
6083       epilog_stmt = gimple_seq_last_stmt (stmts);
6084       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6085       scalar_results[0] = new_temp;
6086     }
6087 
6088   /* Record this operation if it could be reused by the epilogue loop.  */
6089   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6090       && vec_num == 1)
6091     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6092 					   { orig_reduc_input, reduc_info });
6093 
6094   if (double_reduc)
6095     loop = outer_loop;
6096 
6097   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6098           phis with new adjusted scalar results, i.e., replace use <s_out0>
6099           with use <s_out4>.
6100 
6101      Transform:
6102         loop_exit:
6103           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6104           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6105           v_out2 = reduce <v_out1>
6106           s_out3 = extract_field <v_out2, 0>
6107           s_out4 = adjust_result <s_out3>
6108           use <s_out0>
6109           use <s_out0>
6110 
6111      into:
6112 
6113         loop_exit:
6114           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6115           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6116           v_out2 = reduce <v_out1>
6117           s_out3 = extract_field <v_out2, 0>
6118           s_out4 = adjust_result <s_out3>
6119           use <s_out4>
6120           use <s_out4> */
6121 
6122   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6123   for (k = 0; k < live_out_stmts.size (); k++)
6124     {
6125       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6126       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6127 
6128       phis.create (3);
6129       /* Find the loop-closed-use at the loop exit of the original scalar
6130          result.  (The reduction result is expected to have two immediate uses,
6131          one at the latch block, and one at the loop exit).  For double
6132          reductions we are looking for exit phis of the outer loop.  */
6133       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6134         {
6135           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6136 	    {
6137 	      if (!is_gimple_debug (USE_STMT (use_p)))
6138 		phis.safe_push (USE_STMT (use_p));
6139 	    }
6140           else
6141             {
6142               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6143                 {
6144                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6145 
6146                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6147                     {
6148                       if (!flow_bb_inside_loop_p (loop,
6149                                              gimple_bb (USE_STMT (phi_use_p)))
6150 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6151                         phis.safe_push (USE_STMT (phi_use_p));
6152                     }
6153                 }
6154             }
6155         }
6156 
6157       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6158         {
6159           /* Replace the uses:  */
6160           orig_name = PHI_RESULT (exit_phi);
6161 
6162 	  /* Look for a single use at the target of the skip edge.  */
6163 	  if (unify_with_main_loop_p)
6164 	    {
6165 	      use_operand_p use_p;
6166 	      gimple *user;
6167 	      if (!single_imm_use (orig_name, &use_p, &user))
6168 		gcc_unreachable ();
6169 	      orig_name = gimple_get_lhs (user);
6170 	    }
6171 
6172           scalar_result = scalar_results[k];
6173           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6174 	    {
6175 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6176 		SET_USE (use_p, scalar_result);
6177 	      update_stmt (use_stmt);
6178 	    }
6179         }
6180 
6181       phis.release ();
6182     }
6183 }
6184 
6185 /* Return a vector of type VECTYPE that is equal to the vector select
6186    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6187    before GSI.  */
6188 
6189 static tree
6190 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6191 		     tree vec, tree identity)
6192 {
6193   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6194   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6195 					  mask, vec, identity);
6196   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6197   return cond;
6198 }
6199 
6200 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6201    order, starting with LHS.  Insert the extraction statements before GSI and
6202    associate the new scalar SSA names with variable SCALAR_DEST.
6203    Return the SSA name for the result.  */
6204 
6205 static tree
6206 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6207 		       tree_code code, tree lhs, tree vector_rhs)
6208 {
6209   tree vectype = TREE_TYPE (vector_rhs);
6210   tree scalar_type = TREE_TYPE (vectype);
6211   tree bitsize = TYPE_SIZE (scalar_type);
6212   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6213   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6214 
6215   for (unsigned HOST_WIDE_INT bit_offset = 0;
6216        bit_offset < vec_size_in_bits;
6217        bit_offset += element_bitsize)
6218     {
6219       tree bitpos = bitsize_int (bit_offset);
6220       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6221 			 bitsize, bitpos);
6222 
6223       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6224       rhs = make_ssa_name (scalar_dest, stmt);
6225       gimple_assign_set_lhs (stmt, rhs);
6226       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6227 
6228       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6229       tree new_name = make_ssa_name (scalar_dest, stmt);
6230       gimple_assign_set_lhs (stmt, new_name);
6231       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6232       lhs = new_name;
6233     }
6234   return lhs;
6235 }
6236 
6237 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6238    type of the vector input.  */
6239 
6240 static internal_fn
6241 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6242 {
6243   internal_fn mask_reduc_fn;
6244 
6245   switch (reduc_fn)
6246     {
6247     case IFN_FOLD_LEFT_PLUS:
6248       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6249       break;
6250 
6251     default:
6252       return IFN_LAST;
6253     }
6254 
6255   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6256 				      OPTIMIZE_FOR_SPEED))
6257     return mask_reduc_fn;
6258   return IFN_LAST;
6259 }
6260 
6261 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6262    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6263    statement.  CODE is the operation performed by STMT_INFO and OPS are
6264    its scalar operands.  REDUC_INDEX is the index of the operand in
6265    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6266    implements in-order reduction, or IFN_LAST if we should open-code it.
6267    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6268    that should be used to control the operation in a fully-masked loop.  */
6269 
6270 static bool
6271 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6272 			       stmt_vec_info stmt_info,
6273 			       gimple_stmt_iterator *gsi,
6274 			       gimple **vec_stmt, slp_tree slp_node,
6275 			       gimple *reduc_def_stmt,
6276 			       tree_code code, internal_fn reduc_fn,
6277 			       tree ops[3], tree vectype_in,
6278 			       int reduc_index, vec_loop_masks *masks)
6279 {
6280   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6281   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6282   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6283 
6284   int ncopies;
6285   if (slp_node)
6286     ncopies = 1;
6287   else
6288     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6289 
6290   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6291   gcc_assert (ncopies == 1);
6292   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6293 
6294   if (slp_node)
6295     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6296 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6297 
6298   tree op0 = ops[1 - reduc_index];
6299 
6300   int group_size = 1;
6301   stmt_vec_info scalar_dest_def_info;
6302   auto_vec<tree> vec_oprnds0;
6303   if (slp_node)
6304     {
6305       auto_vec<vec<tree> > vec_defs (2);
6306       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6307       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6308       vec_defs[0].release ();
6309       vec_defs[1].release ();
6310       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6311       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6312     }
6313   else
6314     {
6315       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6316 				     op0, &vec_oprnds0);
6317       scalar_dest_def_info = stmt_info;
6318     }
6319 
6320   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6321   tree scalar_type = TREE_TYPE (scalar_dest);
6322   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6323 
6324   int vec_num = vec_oprnds0.length ();
6325   gcc_assert (vec_num == 1 || slp_node);
6326   tree vec_elem_type = TREE_TYPE (vectype_out);
6327   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6328 
6329   tree vector_identity = NULL_TREE;
6330   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6331     vector_identity = build_zero_cst (vectype_out);
6332 
6333   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6334   int i;
6335   tree def0;
6336   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6337     {
6338       gimple *new_stmt;
6339       tree mask = NULL_TREE;
6340       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6341 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6342 
6343       /* Handle MINUS by adding the negative.  */
6344       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6345 	{
6346 	  tree negated = make_ssa_name (vectype_out);
6347 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6348 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6349 	  def0 = negated;
6350 	}
6351 
6352       if (mask && mask_reduc_fn == IFN_LAST)
6353 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6354 				    vector_identity);
6355 
6356       /* On the first iteration the input is simply the scalar phi
6357 	 result, and for subsequent iterations it is the output of
6358 	 the preceding operation.  */
6359       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6360 	{
6361 	  if (mask && mask_reduc_fn != IFN_LAST)
6362 	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6363 						   def0, mask);
6364 	  else
6365 	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6366 						   def0);
6367 	  /* For chained SLP reductions the output of the previous reduction
6368 	     operation serves as the input of the next. For the final statement
6369 	     the output cannot be a temporary - we reuse the original
6370 	     scalar destination of the last statement.  */
6371 	  if (i != vec_num - 1)
6372 	    {
6373 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6374 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6375 	      gimple_set_lhs (new_stmt, reduc_var);
6376 	    }
6377 	}
6378       else
6379 	{
6380 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6381 					     reduc_var, def0);
6382 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6383 	  /* Remove the statement, so that we can use the same code paths
6384 	     as for statements that we've just created.  */
6385 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6386 	  gsi_remove (&tmp_gsi, true);
6387 	}
6388 
6389       if (i == vec_num - 1)
6390 	{
6391 	  gimple_set_lhs (new_stmt, scalar_dest);
6392 	  vect_finish_replace_stmt (loop_vinfo,
6393 				    scalar_dest_def_info,
6394 				    new_stmt);
6395 	}
6396       else
6397 	vect_finish_stmt_generation (loop_vinfo,
6398 				     scalar_dest_def_info,
6399 				     new_stmt, gsi);
6400 
6401       if (slp_node)
6402 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6403       else
6404 	{
6405 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6406 	  *vec_stmt = new_stmt;
6407 	}
6408     }
6409 
6410   return true;
6411 }
6412 
6413 /* Function is_nonwrapping_integer_induction.
6414 
6415    Check if STMT_VINO (which is part of loop LOOP) both increments and
6416    does not cause overflow.  */
6417 
6418 static bool
6419 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6420 {
6421   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6422   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6423   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6424   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6425   widest_int ni, max_loop_value, lhs_max;
6426   wi::overflow_type overflow = wi::OVF_NONE;
6427 
6428   /* Make sure the loop is integer based.  */
6429   if (TREE_CODE (base) != INTEGER_CST
6430       || TREE_CODE (step) != INTEGER_CST)
6431     return false;
6432 
6433   /* Check that the max size of the loop will not wrap.  */
6434 
6435   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6436     return true;
6437 
6438   if (! max_stmt_executions (loop, &ni))
6439     return false;
6440 
6441   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6442 			    &overflow);
6443   if (overflow)
6444     return false;
6445 
6446   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6447 			    TYPE_SIGN (lhs_type), &overflow);
6448   if (overflow)
6449     return false;
6450 
6451   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6452 	  <= TYPE_PRECISION (lhs_type));
6453 }
6454 
6455 /* Check if masking can be supported by inserting a conditional expression.
6456    CODE is the code for the operation.  COND_FN is the conditional internal
6457    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6458 static bool
6459 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6460 			 tree vectype_in)
6461 {
6462   if (cond_fn != IFN_LAST
6463       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6464 					 OPTIMIZE_FOR_SPEED))
6465     return false;
6466 
6467   if (code.is_tree_code ())
6468     switch (tree_code (code))
6469       {
6470       case DOT_PROD_EXPR:
6471       case SAD_EXPR:
6472 	return true;
6473 
6474       default:
6475 	break;
6476       }
6477   return false;
6478 }
6479 
6480 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6481    code for the operation.  VOP is the array of operands.  MASK is the loop
6482    mask.  GSI is a statement iterator used to place the new conditional
6483    expression.  */
6484 static void
6485 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6486 		      gimple_stmt_iterator *gsi)
6487 {
6488   switch (tree_code (code))
6489     {
6490     case DOT_PROD_EXPR:
6491       {
6492 	tree vectype = TREE_TYPE (vop[1]);
6493 	tree zero = build_zero_cst (vectype);
6494 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6495 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6496 					       mask, vop[1], zero);
6497 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6498 	vop[1] = masked_op1;
6499 	break;
6500       }
6501 
6502     case SAD_EXPR:
6503       {
6504 	tree vectype = TREE_TYPE (vop[1]);
6505 	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6506 	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6507 					       mask, vop[1], vop[0]);
6508 	gsi_insert_before (gsi, select, GSI_SAME_STMT);
6509 	vop[1] = masked_op1;
6510 	break;
6511       }
6512 
6513     default:
6514       gcc_unreachable ();
6515     }
6516 }
6517 
6518 /* Function vectorizable_reduction.
6519 
6520    Check if STMT_INFO performs a reduction operation that can be vectorized.
6521    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6522    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6523    Return true if STMT_INFO is vectorizable in this way.
6524 
6525    This function also handles reduction idioms (patterns) that have been
6526    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6527    may be of this form:
6528      X = pattern_expr (arg0, arg1, ..., X)
6529    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6530    sequence that had been detected and replaced by the pattern-stmt
6531    (STMT_INFO).
6532 
6533    This function also handles reduction of condition expressions, for example:
6534      for (int i = 0; i < N; i++)
6535        if (a[i] < value)
6536 	 last = a[i];
6537    This is handled by vectorising the loop and creating an additional vector
6538    containing the loop indexes for which "a[i] < value" was true.  In the
6539    function epilogue this is reduced to a single max value and then used to
6540    index into the vector of results.
6541 
6542    In some cases of reduction patterns, the type of the reduction variable X is
6543    different than the type of the other arguments of STMT_INFO.
6544    In such cases, the vectype that is used when transforming STMT_INFO into
6545    a vector stmt is different than the vectype that is used to determine the
6546    vectorization factor, because it consists of a different number of elements
6547    than the actual number of elements that are being operated upon in parallel.
6548 
6549    For example, consider an accumulation of shorts into an int accumulator.
6550    On some targets it's possible to vectorize this pattern operating on 8
6551    shorts at a time (hence, the vectype for purposes of determining the
6552    vectorization factor should be V8HI); on the other hand, the vectype that
6553    is used to create the vector form is actually V4SI (the type of the result).
6554 
6555    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6556    indicates what is the actual level of parallelism (V8HI in the example), so
6557    that the right vectorization factor would be derived.  This vectype
6558    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6559    be used to create the vectorized stmt.  The right vectype for the vectorized
6560    stmt is obtained from the type of the result X:
6561       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6562 
6563    This means that, contrary to "regular" reductions (or "regular" stmts in
6564    general), the following equation:
6565       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6566    does *NOT* necessarily hold for reduction patterns.  */
6567 
6568 bool
6569 vectorizable_reduction (loop_vec_info loop_vinfo,
6570 			stmt_vec_info stmt_info, slp_tree slp_node,
6571 			slp_instance slp_node_instance,
6572 			stmt_vector_for_cost *cost_vec)
6573 {
6574   tree vectype_in = NULL_TREE;
6575   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6576   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6577   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6578   stmt_vec_info cond_stmt_vinfo = NULL;
6579   int i;
6580   int ncopies;
6581   bool single_defuse_cycle = false;
6582   bool nested_cycle = false;
6583   bool double_reduc = false;
6584   int vec_num;
6585   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6586   tree cond_reduc_val = NULL_TREE;
6587 
6588   /* Make sure it was already recognized as a reduction computation.  */
6589   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6590       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6591       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6592     return false;
6593 
6594   /* The stmt we store reduction analysis meta on.  */
6595   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6596   reduc_info->is_reduc_info = true;
6597 
6598   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6599     {
6600       if (is_a <gphi *> (stmt_info->stmt))
6601 	{
6602 	  if (slp_node)
6603 	    {
6604 	      /* We eventually need to set a vector type on invariant
6605 		 arguments.  */
6606 	      unsigned j;
6607 	      slp_tree child;
6608 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6609 		if (!vect_maybe_update_slp_op_vectype
6610 		       (child, SLP_TREE_VECTYPE (slp_node)))
6611 		  {
6612 		    if (dump_enabled_p ())
6613 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6614 				       "incompatible vector types for "
6615 				       "invariants\n");
6616 		    return false;
6617 		  }
6618 	    }
6619 	  /* Analysis for double-reduction is done on the outer
6620 	     loop PHI, nested cycles have no further restrictions.  */
6621 	  STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6622 	}
6623       else
6624 	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6625       return true;
6626     }
6627 
6628   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6629   stmt_vec_info phi_info = stmt_info;
6630   if (!is_a <gphi *> (stmt_info->stmt))
6631     {
6632       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6633       return true;
6634     }
6635   if (slp_node)
6636     {
6637       slp_node_instance->reduc_phis = slp_node;
6638       /* ???  We're leaving slp_node to point to the PHIs, we only
6639 	 need it to get at the number of vector stmts which wasn't
6640 	 yet initialized for the instance root.  */
6641     }
6642   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6643     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6644   else
6645     {
6646       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6647 		  == vect_double_reduction_def);
6648       use_operand_p use_p;
6649       gimple *use_stmt;
6650       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6651 				 &use_p, &use_stmt);
6652       gcc_assert (res);
6653       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6654       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6655     }
6656 
6657   /* PHIs should not participate in patterns.  */
6658   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6659   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6660 
6661   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6662      and compute the reduction chain length.  Discover the real
6663      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6664   tree reduc_def
6665     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6666 			     loop_latch_edge
6667 			       (gimple_bb (reduc_def_phi)->loop_father));
6668   unsigned reduc_chain_length = 0;
6669   bool only_slp_reduc_chain = true;
6670   stmt_info = NULL;
6671   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6672   while (reduc_def != PHI_RESULT (reduc_def_phi))
6673     {
6674       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6675       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6676       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6677 	{
6678 	  if (dump_enabled_p ())
6679 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 			     "reduction chain broken by patterns.\n");
6681 	  return false;
6682 	}
6683       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6684 	only_slp_reduc_chain = false;
6685       /* For epilogue generation live members of the chain need
6686          to point back to the PHI via their original stmt for
6687 	 info_for_reduction to work.  For SLP we need to look at
6688 	 all lanes here - even though we only will vectorize from
6689 	 the SLP node with live lane zero the other live lanes also
6690 	 need to be identified as part of a reduction to be able
6691 	 to skip code generation for them.  */
6692       if (slp_for_stmt_info)
6693 	{
6694 	  for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6695 	    if (STMT_VINFO_LIVE_P (s))
6696 	      STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6697 	}
6698       else if (STMT_VINFO_LIVE_P (vdef))
6699 	STMT_VINFO_REDUC_DEF (def) = phi_info;
6700       gimple_match_op op;
6701       if (!gimple_extract_op (vdef->stmt, &op))
6702 	{
6703 	  if (dump_enabled_p ())
6704 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6705 			     "reduction chain includes unsupported"
6706 			     " statement type.\n");
6707 	  return false;
6708 	}
6709       if (CONVERT_EXPR_CODE_P (op.code))
6710 	{
6711 	  if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6712 	    {
6713 	      if (dump_enabled_p ())
6714 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6715 				 "conversion in the reduction chain.\n");
6716 	      return false;
6717 	    }
6718 	}
6719       else if (!stmt_info)
6720 	/* First non-conversion stmt.  */
6721 	stmt_info = vdef;
6722       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6723       reduc_chain_length++;
6724       if (!stmt_info && slp_node)
6725 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6726     }
6727   /* PHIs should not participate in patterns.  */
6728   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6729 
6730   if (nested_in_vect_loop_p (loop, stmt_info))
6731     {
6732       loop = loop->inner;
6733       nested_cycle = true;
6734     }
6735 
6736   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6737      element.  */
6738   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6739     {
6740       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6741       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6742     }
6743   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6744     gcc_assert (slp_node
6745 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6746 
6747   /* 1. Is vectorizable reduction?  */
6748   /* Not supportable if the reduction variable is used in the loop, unless
6749      it's a reduction chain.  */
6750   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6751       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6752     return false;
6753 
6754   /* Reductions that are not used even in an enclosing outer-loop,
6755      are expected to be "live" (used out of the loop).  */
6756   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6757       && !STMT_VINFO_LIVE_P (stmt_info))
6758     return false;
6759 
6760   /* 2. Has this been recognized as a reduction pattern?
6761 
6762      Check if STMT represents a pattern that has been recognized
6763      in earlier analysis stages.  For stmts that represent a pattern,
6764      the STMT_VINFO_RELATED_STMT field records the last stmt in
6765      the original sequence that constitutes the pattern.  */
6766 
6767   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6768   if (orig_stmt_info)
6769     {
6770       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6771       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6772     }
6773 
6774   /* 3. Check the operands of the operation.  The first operands are defined
6775         inside the loop body. The last operand is the reduction variable,
6776         which is defined by the loop-header-phi.  */
6777 
6778   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6779   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6780   gimple_match_op op;
6781   if (!gimple_extract_op (stmt_info->stmt, &op))
6782     gcc_unreachable ();
6783   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6784 			    || op.code == WIDEN_SUM_EXPR
6785 			    || op.code == SAD_EXPR);
6786   enum optab_subtype optab_query_kind = optab_vector;
6787   if (op.code == DOT_PROD_EXPR
6788       && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6789 	  != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6790     optab_query_kind = optab_vector_mixed_sign;
6791 
6792   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6793       && !SCALAR_FLOAT_TYPE_P (op.type))
6794     return false;
6795 
6796   /* Do not try to vectorize bit-precision reductions.  */
6797   if (!type_has_mode_precision_p (op.type))
6798     return false;
6799 
6800   /* For lane-reducing ops we're reducing the number of reduction PHIs
6801      which means the only use of that may be in the lane-reducing operation.  */
6802   if (lane_reduc_code_p
6803       && reduc_chain_length != 1
6804       && !only_slp_reduc_chain)
6805     {
6806       if (dump_enabled_p ())
6807 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 			 "lane-reducing reduction with extra stmts.\n");
6809       return false;
6810     }
6811 
6812   /* All uses but the last are expected to be defined in the loop.
6813      The last use is the reduction variable.  In case of nested cycle this
6814      assumption is not true: we use reduc_index to record the index of the
6815      reduction variable.  */
6816   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6817   /* We need to skip an extra operand for COND_EXPRs with embedded
6818      comparison.  */
6819   unsigned opno_adjust = 0;
6820   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6821     opno_adjust = 1;
6822   for (i = 0; i < (int) op.num_ops; i++)
6823     {
6824       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6825       if (i == 0 && op.code == COND_EXPR)
6826         continue;
6827 
6828       stmt_vec_info def_stmt_info;
6829       enum vect_def_type dt;
6830       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6831 			       i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6832 			       &vectype_op[i], &def_stmt_info))
6833 	{
6834 	  if (dump_enabled_p ())
6835 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836 			     "use not simple.\n");
6837 	  return false;
6838 	}
6839       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6840 	continue;
6841 
6842       /* There should be only one cycle def in the stmt, the one
6843          leading to reduc_def.  */
6844       if (VECTORIZABLE_CYCLE_DEF (dt))
6845 	return false;
6846 
6847       if (!vectype_op[i])
6848 	vectype_op[i]
6849 	  = get_vectype_for_scalar_type (loop_vinfo,
6850 					 TREE_TYPE (op.ops[i]), slp_op[i]);
6851 
6852       /* To properly compute ncopies we are interested in the widest
6853 	 non-reduction input type in case we're looking at a widening
6854 	 accumulation that we later handle in vect_transform_reduction.  */
6855       if (lane_reduc_code_p
6856 	  && vectype_op[i]
6857 	  && (!vectype_in
6858 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6859 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
6860 	vectype_in = vectype_op[i];
6861 
6862       if (op.code == COND_EXPR)
6863 	{
6864 	  /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6865 	  if (dt == vect_constant_def)
6866 	    {
6867 	      cond_reduc_dt = dt;
6868 	      cond_reduc_val = op.ops[i];
6869 	    }
6870 	  if (dt == vect_induction_def
6871 	      && def_stmt_info
6872 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6873 	    {
6874 	      cond_reduc_dt = dt;
6875 	      cond_stmt_vinfo = def_stmt_info;
6876 	    }
6877 	}
6878     }
6879   if (!vectype_in)
6880     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6881   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6882 
6883   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6884   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6885   /* If we have a condition reduction, see if we can simplify it further.  */
6886   if (v_reduc_type == COND_REDUCTION)
6887     {
6888       if (slp_node)
6889 	return false;
6890 
6891       /* When the condition uses the reduction value in the condition, fail.  */
6892       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6893 	{
6894 	  if (dump_enabled_p ())
6895 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 			     "condition depends on previous iteration\n");
6897 	  return false;
6898 	}
6899 
6900       if (reduc_chain_length == 1
6901 	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6902 					     vectype_in, OPTIMIZE_FOR_SPEED))
6903 	{
6904 	  if (dump_enabled_p ())
6905 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 			     "optimizing condition reduction with"
6907 			     " FOLD_EXTRACT_LAST.\n");
6908 	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6909 	}
6910       else if (cond_reduc_dt == vect_induction_def)
6911 	{
6912 	  tree base
6913 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6914 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6915 
6916 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6917 		      && TREE_CODE (step) == INTEGER_CST);
6918 	  cond_reduc_val = NULL_TREE;
6919 	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6920 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6921 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6922 	    ;
6923 	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6924 	     above base; punt if base is the minimum value of the type for
6925 	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6926 	  else if (tree_int_cst_sgn (step) == -1)
6927 	    {
6928 	      cond_reduc_op_code = MIN_EXPR;
6929 	      if (tree_int_cst_sgn (base) == -1)
6930 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6931 	      else if (tree_int_cst_lt (base,
6932 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6933 		cond_reduc_val
6934 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6935 	    }
6936 	  else
6937 	    {
6938 	      cond_reduc_op_code = MAX_EXPR;
6939 	      if (tree_int_cst_sgn (base) == 1)
6940 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6941 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6942 					base))
6943 		cond_reduc_val
6944 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6945 	    }
6946 	  if (cond_reduc_val)
6947 	    {
6948 	      if (dump_enabled_p ())
6949 		dump_printf_loc (MSG_NOTE, vect_location,
6950 				 "condition expression based on "
6951 				 "integer induction.\n");
6952 	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6953 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6954 		= cond_reduc_val;
6955 	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6956 	    }
6957 	}
6958       else if (cond_reduc_dt == vect_constant_def)
6959 	{
6960 	  enum vect_def_type cond_initial_dt;
6961 	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6962 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6963 	  if (cond_initial_dt == vect_constant_def
6964 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6965 				     TREE_TYPE (cond_reduc_val)))
6966 	    {
6967 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6968 				    cond_initial_val, cond_reduc_val);
6969 	      if (e && (integer_onep (e) || integer_zerop (e)))
6970 		{
6971 		  if (dump_enabled_p ())
6972 		    dump_printf_loc (MSG_NOTE, vect_location,
6973 				     "condition expression based on "
6974 				     "compile time constant.\n");
6975 		  /* Record reduction code at analysis stage.  */
6976 		  STMT_VINFO_REDUC_CODE (reduc_info)
6977 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6978 		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6979 		}
6980 	    }
6981 	}
6982     }
6983 
6984   if (STMT_VINFO_LIVE_P (phi_info))
6985     return false;
6986 
6987   if (slp_node)
6988     ncopies = 1;
6989   else
6990     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6991 
6992   gcc_assert (ncopies >= 1);
6993 
6994   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6995 
6996   if (nested_cycle)
6997     {
6998       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6999 		  == vect_double_reduction_def);
7000       double_reduc = true;
7001     }
7002 
7003   /* 4.2. Check support for the epilog operation.
7004 
7005           If STMT represents a reduction pattern, then the type of the
7006           reduction variable may be different than the type of the rest
7007           of the arguments.  For example, consider the case of accumulation
7008           of shorts into an int accumulator; The original code:
7009                         S1: int_a = (int) short_a;
7010           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7011 
7012           was replaced with:
7013                         STMT: int_acc = widen_sum <short_a, int_acc>
7014 
7015           This means that:
7016           1. The tree-code that is used to create the vector operation in the
7017              epilog code (that reduces the partial results) is not the
7018              tree-code of STMT, but is rather the tree-code of the original
7019              stmt from the pattern that STMT is replacing.  I.e, in the example
7020              above we want to use 'widen_sum' in the loop, but 'plus' in the
7021              epilog.
7022           2. The type (mode) we use to check available target support
7023              for the vector operation to be created in the *epilog*, is
7024              determined by the type of the reduction variable (in the example
7025              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7026              However the type (mode) we use to check available target support
7027              for the vector operation to be created *inside the loop*, is
7028              determined by the type of the other arguments to STMT (in the
7029              example we'd check this: optab_handler (widen_sum_optab,
7030 	     vect_short_mode)).
7031 
7032           This is contrary to "regular" reductions, in which the types of all
7033           the arguments are the same as the type of the reduction variable.
7034           For "regular" reductions we can therefore use the same vector type
7035           (and also the same tree-code) when generating the epilog code and
7036           when generating the code inside the loop.  */
7037 
7038   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7039   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7040 
7041   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7042   if (reduction_type == TREE_CODE_REDUCTION)
7043     {
7044       /* Check whether it's ok to change the order of the computation.
7045 	 Generally, when vectorizing a reduction we change the order of the
7046 	 computation.  This may change the behavior of the program in some
7047 	 cases, so we need to check that this is ok.  One exception is when
7048 	 vectorizing an outer-loop: the inner-loop is executed sequentially,
7049 	 and therefore vectorizing reductions in the inner-loop during
7050 	 outer-loop vectorization is safe.  Likewise when we are vectorizing
7051 	 a series of reductions using SLP and the VF is one the reductions
7052 	 are performed in scalar order.  */
7053       if (slp_node
7054 	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7055 	  && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7056 	;
7057       else if (needs_fold_left_reduction_p (op.type, orig_code))
7058 	{
7059 	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
7060 	     is not directy used in stmt.  */
7061 	  if (!only_slp_reduc_chain
7062 	      && reduc_chain_length != 1)
7063 	    {
7064 	      if (dump_enabled_p ())
7065 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7066 				 "in-order reduction chain without SLP.\n");
7067 	      return false;
7068 	    }
7069 	  STMT_VINFO_REDUC_TYPE (reduc_info)
7070 	    = reduction_type = FOLD_LEFT_REDUCTION;
7071 	}
7072       else if (!commutative_binary_op_p (orig_code, op.type)
7073 	       || !associative_binary_op_p (orig_code, op.type))
7074 	{
7075 	  if (dump_enabled_p ())
7076 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 			    "reduction: not commutative/associative");
7078 	  return false;
7079 	}
7080     }
7081 
7082   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7083       && ncopies > 1)
7084     {
7085       if (dump_enabled_p ())
7086 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087 			 "multiple types in double reduction or condition "
7088 			 "reduction or fold-left reduction.\n");
7089       return false;
7090     }
7091 
7092   internal_fn reduc_fn = IFN_LAST;
7093   if (reduction_type == TREE_CODE_REDUCTION
7094       || reduction_type == FOLD_LEFT_REDUCTION
7095       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7096       || reduction_type == CONST_COND_REDUCTION)
7097     {
7098       if (reduction_type == FOLD_LEFT_REDUCTION
7099 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
7100 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7101 	{
7102 	  if (reduc_fn != IFN_LAST
7103 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7104 						  OPTIMIZE_FOR_SPEED))
7105 	    {
7106 	      if (dump_enabled_p ())
7107 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7108 				 "reduc op not supported by target.\n");
7109 
7110 	      reduc_fn = IFN_LAST;
7111 	    }
7112 	}
7113       else
7114 	{
7115 	  if (!nested_cycle || double_reduc)
7116 	    {
7117 	      if (dump_enabled_p ())
7118 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7119 				 "no reduc code for scalar code.\n");
7120 
7121 	      return false;
7122 	    }
7123 	}
7124     }
7125   else if (reduction_type == COND_REDUCTION)
7126     {
7127       int scalar_precision
7128 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7129       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7130       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7131 						vectype_out);
7132 
7133       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7134 					  OPTIMIZE_FOR_SPEED))
7135 	reduc_fn = IFN_REDUC_MAX;
7136     }
7137   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7138 
7139   if (reduction_type != EXTRACT_LAST_REDUCTION
7140       && (!nested_cycle || double_reduc)
7141       && reduc_fn == IFN_LAST
7142       && !nunits_out.is_constant ())
7143     {
7144       if (dump_enabled_p ())
7145 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7146 			 "missing target support for reduction on"
7147 			 " variable-length vectors.\n");
7148       return false;
7149     }
7150 
7151   /* For SLP reductions, see if there is a neutral value we can use.  */
7152   tree neutral_op = NULL_TREE;
7153   if (slp_node)
7154     {
7155       tree initial_value = NULL_TREE;
7156       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7157 	initial_value = vect_phi_initial_value (reduc_def_phi);
7158       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7159 					     orig_code, initial_value);
7160     }
7161 
7162   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7163     {
7164       /* We can't support in-order reductions of code such as this:
7165 
7166 	   for (int i = 0; i < n1; ++i)
7167 	     for (int j = 0; j < n2; ++j)
7168 	       l += a[j];
7169 
7170 	 since GCC effectively transforms the loop when vectorizing:
7171 
7172 	   for (int i = 0; i < n1 / VF; ++i)
7173 	     for (int j = 0; j < n2; ++j)
7174 	       for (int k = 0; k < VF; ++k)
7175 		 l += a[j];
7176 
7177 	 which is a reassociation of the original operation.  */
7178       if (dump_enabled_p ())
7179 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7180 			 "in-order double reduction not supported.\n");
7181 
7182       return false;
7183     }
7184 
7185   if (reduction_type == FOLD_LEFT_REDUCTION
7186       && slp_node
7187       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7188     {
7189       /* We cannot use in-order reductions in this case because there is
7190 	 an implicit reassociation of the operations involved.  */
7191       if (dump_enabled_p ())
7192 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193 			 "in-order unchained SLP reductions not supported.\n");
7194       return false;
7195     }
7196 
7197   /* For double reductions, and for SLP reductions with a neutral value,
7198      we construct a variable-length initial vector by loading a vector
7199      full of the neutral value and then shift-and-inserting the start
7200      values into the low-numbered elements.  */
7201   if ((double_reduc || neutral_op)
7202       && !nunits_out.is_constant ()
7203       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7204 					  vectype_out, OPTIMIZE_FOR_SPEED))
7205     {
7206       if (dump_enabled_p ())
7207 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7208 			 "reduction on variable-length vectors requires"
7209 			 " target support for a vector-shift-and-insert"
7210 			 " operation.\n");
7211       return false;
7212     }
7213 
7214   /* Check extra constraints for variable-length unchained SLP reductions.  */
7215   if (slp_node
7216       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7217       && !nunits_out.is_constant ())
7218     {
7219       /* We checked above that we could build the initial vector when
7220 	 there's a neutral element value.  Check here for the case in
7221 	 which each SLP statement has its own initial value and in which
7222 	 that value needs to be repeated for every instance of the
7223 	 statement within the initial vector.  */
7224       unsigned int group_size = SLP_TREE_LANES (slp_node);
7225       if (!neutral_op
7226 	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7227 					      TREE_TYPE (vectype_out)))
7228 	{
7229 	  if (dump_enabled_p ())
7230 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7231 			     "unsupported form of SLP reduction for"
7232 			     " variable-length vectors: cannot build"
7233 			     " initial vector.\n");
7234 	  return false;
7235 	}
7236       /* The epilogue code relies on the number of elements being a multiple
7237 	 of the group size.  The duplicate-and-interleave approach to setting
7238 	 up the initial vector does too.  */
7239       if (!multiple_p (nunits_out, group_size))
7240 	{
7241 	  if (dump_enabled_p ())
7242 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7243 			     "unsupported form of SLP reduction for"
7244 			     " variable-length vectors: the vector size"
7245 			     " is not a multiple of the number of results.\n");
7246 	  return false;
7247 	}
7248     }
7249 
7250   if (reduction_type == COND_REDUCTION)
7251     {
7252       widest_int ni;
7253 
7254       if (! max_loop_iterations (loop, &ni))
7255 	{
7256 	  if (dump_enabled_p ())
7257 	    dump_printf_loc (MSG_NOTE, vect_location,
7258 			     "loop count not known, cannot create cond "
7259 			     "reduction.\n");
7260 	  return false;
7261 	}
7262       /* Convert backedges to iterations.  */
7263       ni += 1;
7264 
7265       /* The additional index will be the same type as the condition.  Check
7266 	 that the loop can fit into this less one (because we'll use up the
7267 	 zero slot for when there are no matches).  */
7268       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7269       if (wi::geu_p (ni, wi::to_widest (max_index)))
7270 	{
7271 	  if (dump_enabled_p ())
7272 	    dump_printf_loc (MSG_NOTE, vect_location,
7273 			     "loop size is greater than data size.\n");
7274 	  return false;
7275 	}
7276     }
7277 
7278   /* In case the vectorization factor (VF) is bigger than the number
7279      of elements that we can fit in a vectype (nunits), we have to generate
7280      more than one vector stmt - i.e - we need to "unroll" the
7281      vector stmt by a factor VF/nunits.  For more details see documentation
7282      in vectorizable_operation.  */
7283 
7284   /* If the reduction is used in an outer loop we need to generate
7285      VF intermediate results, like so (e.g. for ncopies=2):
7286 	r0 = phi (init, r0)
7287 	r1 = phi (init, r1)
7288 	r0 = x0 + r0;
7289         r1 = x1 + r1;
7290     (i.e. we generate VF results in 2 registers).
7291     In this case we have a separate def-use cycle for each copy, and therefore
7292     for each copy we get the vector def for the reduction variable from the
7293     respective phi node created for this copy.
7294 
7295     Otherwise (the reduction is unused in the loop nest), we can combine
7296     together intermediate results, like so (e.g. for ncopies=2):
7297 	r = phi (init, r)
7298 	r = x0 + r;
7299 	r = x1 + r;
7300    (i.e. we generate VF/2 results in a single register).
7301    In this case for each copy we get the vector def for the reduction variable
7302    from the vectorized reduction operation generated in the previous iteration.
7303 
7304    This only works when we see both the reduction PHI and its only consumer
7305    in vectorizable_reduction and there are no intermediate stmts
7306    participating.  When unrolling we want each unrolled iteration to have its
7307    own reduction accumulator since one of the main goals of unrolling a
7308    reduction is to reduce the aggregate loop-carried latency.  */
7309   if (ncopies > 1
7310       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7311       && reduc_chain_length == 1
7312       && loop_vinfo->suggested_unroll_factor == 1)
7313     single_defuse_cycle = true;
7314 
7315   if (single_defuse_cycle || lane_reduc_code_p)
7316     {
7317       gcc_assert (op.code != COND_EXPR);
7318 
7319       /* 4. Supportable by target?  */
7320       bool ok = true;
7321 
7322       /* 4.1. check support for the operation in the loop  */
7323       machine_mode vec_mode = TYPE_MODE (vectype_in);
7324       if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7325         {
7326           if (dump_enabled_p ())
7327             dump_printf (MSG_NOTE, "op not supported by target.\n");
7328 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7329 	      || !vect_can_vectorize_without_simd_p (op.code))
7330 	    ok = false;
7331 	  else
7332 	    if (dump_enabled_p ())
7333 	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7334         }
7335 
7336       if (vect_emulated_vector_p (vectype_in)
7337 	  && !vect_can_vectorize_without_simd_p (op.code))
7338 	{
7339 	  if (dump_enabled_p ())
7340 	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
7341 	  return false;
7342 	}
7343 
7344       /* lane-reducing operations have to go through vect_transform_reduction.
7345          For the other cases try without the single cycle optimization.  */
7346       if (!ok)
7347 	{
7348 	  if (lane_reduc_code_p)
7349 	    return false;
7350 	  else
7351 	    single_defuse_cycle = false;
7352 	}
7353     }
7354   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7355 
7356   /* If the reduction stmt is one of the patterns that have lane
7357      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7358   if ((ncopies > 1 && ! single_defuse_cycle)
7359       && lane_reduc_code_p)
7360     {
7361       if (dump_enabled_p ())
7362 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 			 "multi def-use cycle not possible for lane-reducing "
7364 			 "reduction operation\n");
7365       return false;
7366     }
7367 
7368   if (slp_node
7369       && !(!single_defuse_cycle
7370 	   && !lane_reduc_code_p
7371 	   && reduction_type != FOLD_LEFT_REDUCTION))
7372     for (i = 0; i < (int) op.num_ops; i++)
7373       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7374 	{
7375 	  if (dump_enabled_p ())
7376 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7377 			     "incompatible vector types for invariants\n");
7378 	  return false;
7379 	}
7380 
7381   if (slp_node)
7382     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7383   else
7384     vec_num = 1;
7385 
7386   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7387 			     reduction_type, ncopies, cost_vec);
7388   /* Cost the reduction op inside the loop if transformed via
7389      vect_transform_reduction.  Otherwise this is costed by the
7390      separate vectorizable_* routines.  */
7391   if (single_defuse_cycle || lane_reduc_code_p)
7392     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7393 
7394   if (dump_enabled_p ()
7395       && reduction_type == FOLD_LEFT_REDUCTION)
7396     dump_printf_loc (MSG_NOTE, vect_location,
7397 		     "using an in-order (fold-left) reduction.\n");
7398   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7399   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7400      reductions go through their own vectorizable_* routines.  */
7401   if (!single_defuse_cycle
7402       && !lane_reduc_code_p
7403       && reduction_type != FOLD_LEFT_REDUCTION)
7404     {
7405       stmt_vec_info tem
7406 	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7407       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7408 	{
7409 	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7410 	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7411 	}
7412       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7413       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7414     }
7415   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7416     {
7417       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7418       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7419 
7420       if (reduction_type != FOLD_LEFT_REDUCTION
7421 	  && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7422 	  && (cond_fn == IFN_LAST
7423 	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7424 						  OPTIMIZE_FOR_SPEED)))
7425 	{
7426 	  if (dump_enabled_p ())
7427 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428 			     "can't operate on partial vectors because"
7429 			     " no conditional operation is available.\n");
7430 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7431 	}
7432       else if (reduction_type == FOLD_LEFT_REDUCTION
7433 	       && reduc_fn == IFN_LAST
7434 	       && !expand_vec_cond_expr_p (vectype_in,
7435 					   truth_type_for (vectype_in),
7436 					   SSA_NAME))
7437 	{
7438 	  if (dump_enabled_p ())
7439 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7440 			     "can't operate on partial vectors because"
7441 			     " no conditional operation is available.\n");
7442 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7443 	}
7444       else
7445 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7446 			       vectype_in, NULL);
7447     }
7448   return true;
7449 }
7450 
7451 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7452    value.  */
7453 
7454 bool
7455 vect_transform_reduction (loop_vec_info loop_vinfo,
7456 			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7457 			  gimple **vec_stmt, slp_tree slp_node)
7458 {
7459   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7460   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7461   int i;
7462   int ncopies;
7463   int vec_num;
7464 
7465   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7466   gcc_assert (reduc_info->is_reduc_info);
7467 
7468   if (nested_in_vect_loop_p (loop, stmt_info))
7469     {
7470       loop = loop->inner;
7471       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7472     }
7473 
7474   gimple_match_op op;
7475   if (!gimple_extract_op (stmt_info->stmt, &op))
7476     gcc_unreachable ();
7477 
7478   /* All uses but the last are expected to be defined in the loop.
7479      The last use is the reduction variable.  In case of nested cycle this
7480      assumption is not true: we use reduc_index to record the index of the
7481      reduction variable.  */
7482   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7483   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7484   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7485   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7486 
7487   if (slp_node)
7488     {
7489       ncopies = 1;
7490       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7491     }
7492   else
7493     {
7494       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7495       vec_num = 1;
7496     }
7497 
7498   code_helper code = canonicalize_code (op.code, op.type);
7499   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7500   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7501   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7502 
7503   /* Transform.  */
7504   tree new_temp = NULL_TREE;
7505   auto_vec<tree> vec_oprnds0;
7506   auto_vec<tree> vec_oprnds1;
7507   auto_vec<tree> vec_oprnds2;
7508   tree def0;
7509 
7510   if (dump_enabled_p ())
7511     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7512 
7513   /* FORNOW: Multiple types are not supported for condition.  */
7514   if (code == COND_EXPR)
7515     gcc_assert (ncopies == 1);
7516 
7517   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7518 
7519   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7520   if (reduction_type == FOLD_LEFT_REDUCTION)
7521     {
7522       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7523       gcc_assert (code.is_tree_code ());
7524       return vectorize_fold_left_reduction
7525 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7526 	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7527     }
7528 
7529   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7530   gcc_assert (single_defuse_cycle
7531 	      || code == DOT_PROD_EXPR
7532 	      || code == WIDEN_SUM_EXPR
7533 	      || code == SAD_EXPR);
7534 
7535   /* Create the destination vector  */
7536   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7537   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7538 
7539   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7540 		     single_defuse_cycle && reduc_index == 0
7541 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
7542 		     single_defuse_cycle && reduc_index == 1
7543 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
7544 		     op.num_ops == 3
7545 		     && !(single_defuse_cycle && reduc_index == 2)
7546 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7547   if (single_defuse_cycle)
7548     {
7549       gcc_assert (!slp_node);
7550       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7551 				     op.ops[reduc_index],
7552 				     reduc_index == 0 ? &vec_oprnds0
7553 				     : (reduc_index == 1 ? &vec_oprnds1
7554 					: &vec_oprnds2));
7555     }
7556 
7557   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7558     {
7559       gimple *new_stmt;
7560       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7561       if (masked_loop_p && !mask_by_cond_expr)
7562 	{
7563 	  /* Make sure that the reduction accumulator is vop[0].  */
7564 	  if (reduc_index == 1)
7565 	    {
7566 	      gcc_assert (commutative_binary_op_p (code, op.type));
7567 	      std::swap (vop[0], vop[1]);
7568 	    }
7569 	  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7570 					  vectype_in, i);
7571 	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7572 						    vop[0], vop[1], vop[0]);
7573 	  new_temp = make_ssa_name (vec_dest, call);
7574 	  gimple_call_set_lhs (call, new_temp);
7575 	  gimple_call_set_nothrow (call, true);
7576 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7577 	  new_stmt = call;
7578 	}
7579       else
7580 	{
7581 	  if (op.num_ops == 3)
7582 	    vop[2] = vec_oprnds2[i];
7583 
7584 	  if (masked_loop_p && mask_by_cond_expr)
7585 	    {
7586 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7587 					      vectype_in, i);
7588 	      build_vect_cond_expr (code, vop, mask, gsi);
7589 	    }
7590 
7591 	  if (code.is_internal_fn ())
7592 	    new_stmt = gimple_build_call_internal (internal_fn (code),
7593 						   op.num_ops,
7594 						   vop[0], vop[1], vop[2]);
7595 	  else
7596 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7597 					    vop[0], vop[1], vop[2]);
7598 	  new_temp = make_ssa_name (vec_dest, new_stmt);
7599 	  gimple_set_lhs (new_stmt, new_temp);
7600 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7601 	}
7602 
7603       if (slp_node)
7604 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7605       else if (single_defuse_cycle
7606 	       && i < ncopies - 1)
7607 	{
7608 	  if (reduc_index == 0)
7609 	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7610 	  else if (reduc_index == 1)
7611 	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7612 	  else if (reduc_index == 2)
7613 	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7614 	}
7615       else
7616 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7617     }
7618 
7619   if (!slp_node)
7620     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7621 
7622   return true;
7623 }
7624 
7625 /* Transform phase of a cycle PHI.  */
7626 
7627 bool
7628 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7629 			  stmt_vec_info stmt_info, gimple **vec_stmt,
7630 			  slp_tree slp_node, slp_instance slp_node_instance)
7631 {
7632   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7633   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7634   int i;
7635   int ncopies;
7636   int j;
7637   bool nested_cycle = false;
7638   int vec_num;
7639 
7640   if (nested_in_vect_loop_p (loop, stmt_info))
7641     {
7642       loop = loop->inner;
7643       nested_cycle = true;
7644     }
7645 
7646   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7647   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7648   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7649   gcc_assert (reduc_info->is_reduc_info);
7650 
7651   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7652       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7653     /* Leave the scalar phi in place.  */
7654     return true;
7655 
7656   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7657   /* For a nested cycle we do not fill the above.  */
7658   if (!vectype_in)
7659     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7660   gcc_assert (vectype_in);
7661 
7662   if (slp_node)
7663     {
7664       /* The size vect_schedule_slp_instance computes is off for us.  */
7665       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7666 				      * SLP_TREE_LANES (slp_node), vectype_in);
7667       ncopies = 1;
7668     }
7669   else
7670     {
7671       vec_num = 1;
7672       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7673     }
7674 
7675   /* Check whether we should use a single PHI node and accumulate
7676      vectors to one before the backedge.  */
7677   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7678     ncopies = 1;
7679 
7680   /* Create the destination vector  */
7681   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7682   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7683 					       vectype_out);
7684 
7685   /* Get the loop-entry arguments.  */
7686   tree vec_initial_def = NULL_TREE;
7687   auto_vec<tree> vec_initial_defs;
7688   if (slp_node)
7689     {
7690       vec_initial_defs.reserve (vec_num);
7691       if (nested_cycle)
7692 	{
7693 	  unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7694 	  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7695 			     &vec_initial_defs);
7696 	}
7697       else
7698 	{
7699 	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
7700 	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
7701 	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7702 
7703 	  unsigned int num_phis = stmts.length ();
7704 	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7705 	    num_phis = 1;
7706 	  initial_values.reserve (num_phis);
7707 	  for (unsigned int i = 0; i < num_phis; ++i)
7708 	    {
7709 	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7710 	      initial_values.quick_push (vect_phi_initial_value (this_phi));
7711 	    }
7712 	  if (vec_num == 1)
7713 	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7714 	  if (!initial_values.is_empty ())
7715 	    {
7716 	      tree initial_value
7717 		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
7718 	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7719 	      tree neutral_op
7720 		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
7721 					    code, initial_value);
7722 	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7723 					      &vec_initial_defs, vec_num,
7724 					      stmts.length (), neutral_op);
7725 	    }
7726 	}
7727     }
7728   else
7729     {
7730       /* Get at the scalar def before the loop, that defines the initial
7731 	 value of the reduction variable.  */
7732       tree initial_def = vect_phi_initial_value (phi);
7733       reduc_info->reduc_initial_values.safe_push (initial_def);
7734       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7735 	 and we can't use zero for induc_val, use initial_def.  Similarly
7736 	 for REDUC_MIN and initial_def larger than the base.  */
7737       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7738 	{
7739 	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7740 	  if (TREE_CODE (initial_def) == INTEGER_CST
7741 	      && !integer_zerop (induc_val)
7742 	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7743 		   && tree_int_cst_lt (initial_def, induc_val))
7744 		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7745 		      && tree_int_cst_lt (induc_val, initial_def))))
7746 	    {
7747 	      induc_val = initial_def;
7748 	      /* Communicate we used the initial_def to epilouge
7749 		 generation.  */
7750 	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7751 	    }
7752 	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7753 	}
7754       else if (nested_cycle)
7755 	{
7756 	  /* Do not use an adjustment def as that case is not supported
7757 	     correctly if ncopies is not one.  */
7758 	  vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7759 					 ncopies, initial_def,
7760 					 &vec_initial_defs);
7761 	}
7762       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7763 	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7764 	/* Fill the initial vector with the initial scalar value.  */
7765 	vec_initial_def
7766 	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7767 					   initial_def, initial_def);
7768       else
7769 	{
7770 	  if (ncopies == 1)
7771 	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7772 	  if (!reduc_info->reduc_initial_values.is_empty ())
7773 	    {
7774 	      initial_def = reduc_info->reduc_initial_values[0];
7775 	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7776 	      tree neutral_op
7777 		= neutral_op_for_reduction (TREE_TYPE (initial_def),
7778 					    code, initial_def);
7779 	      gcc_assert (neutral_op);
7780 	      /* Try to simplify the vector initialization by applying an
7781 		 adjustment after the reduction has been performed.  */
7782 	      if (!reduc_info->reused_accumulator
7783 		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7784 		  && !operand_equal_p (neutral_op, initial_def))
7785 		{
7786 		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7787 		    = initial_def;
7788 		  initial_def = neutral_op;
7789 		}
7790 	      vec_initial_def
7791 		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
7792 						 initial_def, neutral_op);
7793 	    }
7794 	}
7795     }
7796 
7797   if (vec_initial_def)
7798     {
7799       vec_initial_defs.create (ncopies);
7800       for (i = 0; i < ncopies; ++i)
7801 	vec_initial_defs.quick_push (vec_initial_def);
7802     }
7803 
7804   if (auto *accumulator = reduc_info->reused_accumulator)
7805     {
7806       tree def = accumulator->reduc_input;
7807       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7808 	{
7809 	  unsigned int nreduc;
7810 	  bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7811 					    (TREE_TYPE (def)),
7812 					  TYPE_VECTOR_SUBPARTS (vectype_out),
7813 					  &nreduc);
7814 	  gcc_assert (res);
7815 	  gimple_seq stmts = NULL;
7816 	  /* Reduce the single vector to a smaller one.  */
7817 	  if (nreduc != 1)
7818 	    {
7819 	      /* Perform the reduction in the appropriate type.  */
7820 	      tree rvectype = vectype_out;
7821 	      if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7822 					      TREE_TYPE (TREE_TYPE (def))))
7823 		rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7824 					      TYPE_VECTOR_SUBPARTS
7825 						(vectype_out));
7826 	      def = vect_create_partial_epilog (def, rvectype,
7827 						STMT_VINFO_REDUC_CODE
7828 						  (reduc_info),
7829 						&stmts);
7830 	    }
7831 	  /* The epilogue loop might use a different vector mode, like
7832 	     VNx2DI vs. V2DI.  */
7833 	  if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7834 	    {
7835 	      tree reduc_type = build_vector_type_for_mode
7836 		(TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7837 	      def = gimple_convert (&stmts, reduc_type, def);
7838 	    }
7839 	  /* Adjust the input so we pick up the partially reduced value
7840 	     for the skip edge in vect_create_epilog_for_reduction.  */
7841 	  accumulator->reduc_input = def;
7842 	  /* And the reduction could be carried out using a different sign.  */
7843 	  if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7844 	    def = gimple_convert (&stmts, vectype_out, def);
7845 	  if (loop_vinfo->main_loop_edge)
7846 	    {
7847 	      /* While we'd like to insert on the edge this will split
7848 		 blocks and disturb bookkeeping, we also will eventually
7849 		 need this on the skip edge.  Rely on sinking to
7850 		 fixup optimal placement and insert in the pred.  */
7851 	      gimple_stmt_iterator gsi
7852 		= gsi_last_bb (loop_vinfo->main_loop_edge->src);
7853 	      /* Insert before a cond that eventually skips the
7854 		 epilogue.  */
7855 	      if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7856 		gsi_prev (&gsi);
7857 	      gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7858 	    }
7859 	  else
7860 	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7861 					      stmts);
7862 	}
7863       if (loop_vinfo->main_loop_edge)
7864 	vec_initial_defs[0]
7865 	  = vect_get_main_loop_result (loop_vinfo, def,
7866 				       vec_initial_defs[0]);
7867       else
7868 	vec_initial_defs.safe_push (def);
7869     }
7870 
7871   /* Generate the reduction PHIs upfront.  */
7872   for (i = 0; i < vec_num; i++)
7873     {
7874       tree vec_init_def = vec_initial_defs[i];
7875       for (j = 0; j < ncopies; j++)
7876 	{
7877 	  /* Create the reduction-phi that defines the reduction
7878 	     operand.  */
7879 	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7880 
7881 	  /* Set the loop-entry arg of the reduction-phi.  */
7882 	  if (j != 0 && nested_cycle)
7883 	    vec_init_def = vec_initial_defs[j];
7884 	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7885 		       UNKNOWN_LOCATION);
7886 
7887 	  /* The loop-latch arg is set in epilogue processing.  */
7888 
7889 	  if (slp_node)
7890 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7891 	  else
7892 	    {
7893 	      if (j == 0)
7894 		*vec_stmt = new_phi;
7895 	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7896 	    }
7897 	}
7898     }
7899 
7900   return true;
7901 }
7902 
7903 /* Vectorizes LC PHIs.  */
7904 
7905 bool
7906 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7907 		     stmt_vec_info stmt_info, gimple **vec_stmt,
7908 		     slp_tree slp_node)
7909 {
7910   if (!loop_vinfo
7911       || !is_a <gphi *> (stmt_info->stmt)
7912       || gimple_phi_num_args (stmt_info->stmt) != 1)
7913     return false;
7914 
7915   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7916       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7917     return false;
7918 
7919   if (!vec_stmt) /* transformation not required.  */
7920     {
7921       /* Deal with copies from externs or constants that disguise as
7922 	 loop-closed PHI nodes (PR97886).  */
7923       if (slp_node
7924 	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7925 						SLP_TREE_VECTYPE (slp_node)))
7926 	{
7927 	  if (dump_enabled_p ())
7928 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7929 			     "incompatible vector types for invariants\n");
7930 	  return false;
7931 	}
7932       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7933       return true;
7934     }
7935 
7936   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7937   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7938   basic_block bb = gimple_bb (stmt_info->stmt);
7939   edge e = single_pred_edge (bb);
7940   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7941   auto_vec<tree> vec_oprnds;
7942   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7943 		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7944 		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7945   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7946     {
7947       /* Create the vectorized LC PHI node.  */
7948       gphi *new_phi = create_phi_node (vec_dest, bb);
7949       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7950       if (slp_node)
7951 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7952       else
7953 	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7954     }
7955   if (!slp_node)
7956     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7957 
7958   return true;
7959 }
7960 
7961 /* Vectorizes PHIs.  */
7962 
7963 bool
7964 vectorizable_phi (vec_info *,
7965 		  stmt_vec_info stmt_info, gimple **vec_stmt,
7966 		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7967 {
7968   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7969     return false;
7970 
7971   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7972     return false;
7973 
7974   tree vectype = SLP_TREE_VECTYPE (slp_node);
7975 
7976   if (!vec_stmt) /* transformation not required.  */
7977     {
7978       slp_tree child;
7979       unsigned i;
7980       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7981 	if (!child)
7982 	  {
7983 	    if (dump_enabled_p ())
7984 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7985 			       "PHI node with unvectorized backedge def\n");
7986 	    return false;
7987 	  }
7988 	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7989 	  {
7990 	    if (dump_enabled_p ())
7991 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992 			       "incompatible vector types for invariants\n");
7993 	    return false;
7994 	  }
7995 	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7996 		 && !useless_type_conversion_p (vectype,
7997 						SLP_TREE_VECTYPE (child)))
7998 	  {
7999 	    /* With bools we can have mask and non-mask precision vectors
8000 	       or different non-mask precisions.  while pattern recog is
8001 	       supposed to guarantee consistency here bugs in it can cause
8002 	       mismatches (PR103489 and PR103800 for example).
8003 	       Deal with them here instead of ICEing later.  */
8004 	    if (dump_enabled_p ())
8005 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8006 			       "incompatible vector type setup from "
8007 			       "bool pattern detection\n");
8008 	    return false;
8009 	  }
8010 
8011       /* For single-argument PHIs assume coalescing which means zero cost
8012 	 for the scalar and the vector PHIs.  This avoids artificially
8013 	 favoring the vector path (but may pessimize it in some cases).  */
8014       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8015 	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8016 			  vector_stmt, stmt_info, vectype, 0, vect_body);
8017       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8018       return true;
8019     }
8020 
8021   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8022   basic_block bb = gimple_bb (stmt_info->stmt);
8023   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8024   auto_vec<gphi *> new_phis;
8025   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8026     {
8027       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8028 
8029       /* Skip not yet vectorized defs.  */
8030       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8031 	  && SLP_TREE_VEC_STMTS (child).is_empty ())
8032 	continue;
8033 
8034       auto_vec<tree> vec_oprnds;
8035       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8036       if (!new_phis.exists ())
8037 	{
8038 	  new_phis.create (vec_oprnds.length ());
8039 	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
8040 	    {
8041 	      /* Create the vectorized LC PHI node.  */
8042 	      new_phis.quick_push (create_phi_node (vec_dest, bb));
8043 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8044 	    }
8045 	}
8046       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8047       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8048 	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8049     }
8050   /* We should have at least one already vectorized child.  */
8051   gcc_assert (new_phis.exists ());
8052 
8053   return true;
8054 }
8055 
8056 /* Return true if VECTYPE represents a vector that requires lowering
8057    by the vector lowering pass.  */
8058 
8059 bool
8060 vect_emulated_vector_p (tree vectype)
8061 {
8062   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8063 	  && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8064 	      || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8065 }
8066 
8067 /* Return true if we can emulate CODE on an integer mode representation
8068    of a vector.  */
8069 
8070 bool
8071 vect_can_vectorize_without_simd_p (tree_code code)
8072 {
8073   switch (code)
8074     {
8075     case PLUS_EXPR:
8076     case MINUS_EXPR:
8077     case NEGATE_EXPR:
8078     case BIT_AND_EXPR:
8079     case BIT_IOR_EXPR:
8080     case BIT_XOR_EXPR:
8081     case BIT_NOT_EXPR:
8082       return true;
8083 
8084     default:
8085       return false;
8086     }
8087 }
8088 
8089 /* Likewise, but taking a code_helper.  */
8090 
8091 bool
8092 vect_can_vectorize_without_simd_p (code_helper code)
8093 {
8094   return (code.is_tree_code ()
8095 	  && vect_can_vectorize_without_simd_p (tree_code (code)));
8096 }
8097 
8098 /* Function vectorizable_induction
8099 
8100    Check if STMT_INFO performs an induction computation that can be vectorized.
8101    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8102    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8103    Return true if STMT_INFO is vectorizable in this way.  */
8104 
8105 bool
8106 vectorizable_induction (loop_vec_info loop_vinfo,
8107 			stmt_vec_info stmt_info,
8108 			gimple **vec_stmt, slp_tree slp_node,
8109 			stmt_vector_for_cost *cost_vec)
8110 {
8111   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8112   unsigned ncopies;
8113   bool nested_in_vect_loop = false;
8114   class loop *iv_loop;
8115   tree vec_def;
8116   edge pe = loop_preheader_edge (loop);
8117   basic_block new_bb;
8118   tree new_vec, vec_init, vec_step, t;
8119   tree new_name;
8120   gimple *new_stmt;
8121   gphi *induction_phi;
8122   tree induc_def, vec_dest;
8123   tree init_expr, step_expr;
8124   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8125   unsigned i;
8126   tree expr;
8127   gimple_stmt_iterator si;
8128 
8129   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8130   if (!phi)
8131     return false;
8132 
8133   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8134     return false;
8135 
8136   /* Make sure it was recognized as induction computation.  */
8137   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8138     return false;
8139 
8140   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8141   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8142 
8143   if (slp_node)
8144     ncopies = 1;
8145   else
8146     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8147   gcc_assert (ncopies >= 1);
8148 
8149   /* FORNOW. These restrictions should be relaxed.  */
8150   if (nested_in_vect_loop_p (loop, stmt_info))
8151     {
8152       imm_use_iterator imm_iter;
8153       use_operand_p use_p;
8154       gimple *exit_phi;
8155       edge latch_e;
8156       tree loop_arg;
8157 
8158       if (ncopies > 1)
8159 	{
8160 	  if (dump_enabled_p ())
8161 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8162 			     "multiple types in nested loop.\n");
8163 	  return false;
8164 	}
8165 
8166       exit_phi = NULL;
8167       latch_e = loop_latch_edge (loop->inner);
8168       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8169       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8170 	{
8171 	  gimple *use_stmt = USE_STMT (use_p);
8172 	  if (is_gimple_debug (use_stmt))
8173 	    continue;
8174 
8175 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8176 	    {
8177 	      exit_phi = use_stmt;
8178 	      break;
8179 	    }
8180 	}
8181       if (exit_phi)
8182 	{
8183 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8184 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8185 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8186 	    {
8187 	      if (dump_enabled_p ())
8188 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8189 				 "inner-loop induction only used outside "
8190 				 "of the outer vectorized loop.\n");
8191 	      return false;
8192 	    }
8193 	}
8194 
8195       nested_in_vect_loop = true;
8196       iv_loop = loop->inner;
8197     }
8198   else
8199     iv_loop = loop;
8200   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8201 
8202   if (slp_node && !nunits.is_constant ())
8203     {
8204       /* The current SLP code creates the step value element-by-element.  */
8205       if (dump_enabled_p ())
8206 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8207 			 "SLP induction not supported for variable-length"
8208 			 " vectors.\n");
8209       return false;
8210     }
8211 
8212   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8213     {
8214       if (dump_enabled_p ())
8215 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8216 			 "floating point induction vectorization disabled\n");
8217       return false;
8218     }
8219 
8220   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8221   gcc_assert (step_expr != NULL_TREE);
8222   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8223 
8224   /* Check for backend support of PLUS/MINUS_EXPR. */
8225   if (!directly_supported_p (PLUS_EXPR, step_vectype)
8226       || !directly_supported_p (MINUS_EXPR, step_vectype))
8227     return false;
8228 
8229   if (!vec_stmt) /* transformation not required.  */
8230     {
8231       unsigned inside_cost = 0, prologue_cost = 0;
8232       if (slp_node)
8233 	{
8234 	  /* We eventually need to set a vector type on invariant
8235 	     arguments.  */
8236 	  unsigned j;
8237 	  slp_tree child;
8238 	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8239 	    if (!vect_maybe_update_slp_op_vectype
8240 		(child, SLP_TREE_VECTYPE (slp_node)))
8241 	      {
8242 		if (dump_enabled_p ())
8243 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8244 				   "incompatible vector types for "
8245 				   "invariants\n");
8246 		return false;
8247 	      }
8248 	  /* loop cost for vec_loop.  */
8249 	  inside_cost
8250 	    = record_stmt_cost (cost_vec,
8251 				SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8252 				vector_stmt, stmt_info, 0, vect_body);
8253 	  /* prologue cost for vec_init (if not nested) and step.  */
8254 	  prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8255 					    scalar_to_vec,
8256 					    stmt_info, 0, vect_prologue);
8257 	}
8258       else /* if (!slp_node) */
8259 	{
8260 	  /* loop cost for vec_loop.  */
8261 	  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8262 					  stmt_info, 0, vect_body);
8263 	  /* prologue cost for vec_init and vec_step.  */
8264 	  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8265 					    stmt_info, 0, vect_prologue);
8266 	}
8267       if (dump_enabled_p ())
8268 	dump_printf_loc (MSG_NOTE, vect_location,
8269 			 "vect_model_induction_cost: inside_cost = %d, "
8270 			 "prologue_cost = %d .\n", inside_cost,
8271 			 prologue_cost);
8272 
8273       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8274       DUMP_VECT_SCOPE ("vectorizable_induction");
8275       return true;
8276     }
8277 
8278   /* Transform.  */
8279 
8280   /* Compute a vector variable, initialized with the first VF values of
8281      the induction variable.  E.g., for an iv with IV_PHI='X' and
8282      evolution S, for a vector of 4 units, we want to compute:
8283      [X, X + S, X + 2*S, X + 3*S].  */
8284 
8285   if (dump_enabled_p ())
8286     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8287 
8288   pe = loop_preheader_edge (iv_loop);
8289   /* Find the first insertion point in the BB.  */
8290   basic_block bb = gimple_bb (phi);
8291   si = gsi_after_labels (bb);
8292 
8293   /* For SLP induction we have to generate several IVs as for example
8294      with group size 3 we need
8295        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8296        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8297   if (slp_node)
8298     {
8299       /* Enforced above.  */
8300       unsigned int const_nunits = nunits.to_constant ();
8301 
8302       /* The initial values are vectorized, but any lanes > group_size
8303 	 need adjustment.  */
8304       slp_tree init_node
8305 	= SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8306 
8307       /* Gather steps.  Since we do not vectorize inductions as
8308 	 cycles we have to reconstruct the step from SCEV data.  */
8309       unsigned group_size = SLP_TREE_LANES (slp_node);
8310       tree *steps = XALLOCAVEC (tree, group_size);
8311       tree *inits = XALLOCAVEC (tree, group_size);
8312       stmt_vec_info phi_info;
8313       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8314 	{
8315 	  steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8316 	  if (!init_node)
8317 	    inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8318 					   pe->dest_idx);
8319 	}
8320 
8321       /* Now generate the IVs.  */
8322       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8323       gcc_assert ((const_nunits * nvects) % group_size == 0);
8324       unsigned nivs;
8325       if (nested_in_vect_loop)
8326 	nivs = nvects;
8327       else
8328 	{
8329 	  /* Compute the number of distinct IVs we need.  First reduce
8330 	     group_size if it is a multiple of const_nunits so we get
8331 	     one IV for a group_size of 4 but const_nunits 2.  */
8332 	  unsigned group_sizep = group_size;
8333 	  if (group_sizep % const_nunits == 0)
8334 	    group_sizep = group_sizep / const_nunits;
8335 	  nivs = least_common_multiple (group_sizep,
8336 					const_nunits) / const_nunits;
8337 	}
8338       tree stept = TREE_TYPE (step_vectype);
8339       tree lupdate_mul = NULL_TREE;
8340       if (!nested_in_vect_loop)
8341 	{
8342 	  /* The number of iterations covered in one vector iteration.  */
8343 	  unsigned lup_mul = (nvects * const_nunits) / group_size;
8344 	  lupdate_mul
8345 	    = build_vector_from_val (step_vectype,
8346 				     SCALAR_FLOAT_TYPE_P (stept)
8347 				     ? build_real_from_wide (stept, lup_mul,
8348 							     UNSIGNED)
8349 				     : build_int_cstu (stept, lup_mul));
8350 	}
8351       tree peel_mul = NULL_TREE;
8352       gimple_seq init_stmts = NULL;
8353       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8354 	{
8355 	  if (SCALAR_FLOAT_TYPE_P (stept))
8356 	    peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8357 				     LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8358 	  else
8359 	    peel_mul = gimple_convert (&init_stmts, stept,
8360 				       LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8361 	  peel_mul = gimple_build_vector_from_val (&init_stmts,
8362 						   step_vectype, peel_mul);
8363 	}
8364       unsigned ivn;
8365       auto_vec<tree> vec_steps;
8366       for (ivn = 0; ivn < nivs; ++ivn)
8367 	{
8368 	  tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8369 	  tree_vector_builder init_elts (vectype, const_nunits, 1);
8370 	  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8371 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8372 	    {
8373 	      /* The scalar steps of the IVs.  */
8374 	      tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8375 	      elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8376 	      step_elts.quick_push (elt);
8377 	      if (!init_node)
8378 		{
8379 		  /* The scalar inits of the IVs if not vectorized.  */
8380 		  elt = inits[(ivn*const_nunits + eltn) % group_size];
8381 		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
8382 						  TREE_TYPE (elt)))
8383 		    elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8384 					TREE_TYPE (vectype), elt);
8385 		  init_elts.quick_push (elt);
8386 		}
8387 	      /* The number of steps to add to the initial values.  */
8388 	      unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8389 	      mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8390 				   ? build_real_from_wide (stept,
8391 							   mul_elt, UNSIGNED)
8392 				   : build_int_cstu (stept, mul_elt));
8393 	    }
8394 	  vec_step = gimple_build_vector (&init_stmts, &step_elts);
8395 	  vec_steps.safe_push (vec_step);
8396 	  tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8397 	  if (peel_mul)
8398 	    step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8399 				     step_mul, peel_mul);
8400 	  if (!init_node)
8401 	    vec_init = gimple_build_vector (&init_stmts, &init_elts);
8402 
8403 	  /* Create the induction-phi that defines the induction-operand.  */
8404 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8405 					    "vec_iv_");
8406 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
8407 	  induc_def = PHI_RESULT (induction_phi);
8408 
8409 	  /* Create the iv update inside the loop  */
8410 	  tree up = vec_step;
8411 	  if (lupdate_mul)
8412 	    up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8413 			       vec_step, lupdate_mul);
8414 	  gimple_seq stmts = NULL;
8415 	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8416 	  vec_def = gimple_build (&stmts,
8417 				  PLUS_EXPR, step_vectype, vec_def, up);
8418 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8419 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8420 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8421 		       UNKNOWN_LOCATION);
8422 
8423 	  if (init_node)
8424 	    vec_init = vect_get_slp_vect_def (init_node, ivn);
8425 	  if (!nested_in_vect_loop
8426 	      && !integer_zerop (step_mul))
8427 	    {
8428 	      vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8429 	      up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8430 				 vec_step, step_mul);
8431 	      vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8432 				      vec_def, up);
8433 	      vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8434 	    }
8435 
8436 	  /* Set the arguments of the phi node:  */
8437 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8438 
8439 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8440 	}
8441       if (!nested_in_vect_loop)
8442 	{
8443 	  /* Fill up to the number of vectors we need for the whole group.  */
8444 	  nivs = least_common_multiple (group_size,
8445 					const_nunits) / const_nunits;
8446 	  vec_steps.reserve (nivs-ivn);
8447 	  for (; ivn < nivs; ++ivn)
8448 	    {
8449 	      SLP_TREE_VEC_STMTS (slp_node)
8450 		.quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8451 	      vec_steps.quick_push (vec_steps[0]);
8452 	    }
8453 	}
8454 
8455       /* Re-use IVs when we can.  We are generating further vector
8456 	 stmts by adding VF' * stride to the IVs generated above.  */
8457       if (ivn < nvects)
8458 	{
8459 	  unsigned vfp
8460 	    = least_common_multiple (group_size, const_nunits) / group_size;
8461 	  tree lupdate_mul
8462 	    = build_vector_from_val (step_vectype,
8463 				     SCALAR_FLOAT_TYPE_P (stept)
8464 				     ? build_real_from_wide (stept,
8465 							     vfp, UNSIGNED)
8466 				     : build_int_cstu (stept, vfp));
8467 	  for (; ivn < nvects; ++ivn)
8468 	    {
8469 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8470 	      tree def = gimple_get_lhs (iv);
8471 	      if (ivn < 2*nivs)
8472 		vec_steps[ivn - nivs]
8473 		  = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8474 				  vec_steps[ivn - nivs], lupdate_mul);
8475 	      gimple_seq stmts = NULL;
8476 	      def = gimple_convert (&stmts, step_vectype, def);
8477 	      def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8478 				  def, vec_steps[ivn % nivs]);
8479 	      def = gimple_convert (&stmts, vectype, def);
8480 	      if (gimple_code (iv) == GIMPLE_PHI)
8481 		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8482 	      else
8483 		{
8484 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8485 		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8486 		}
8487 	      SLP_TREE_VEC_STMTS (slp_node)
8488 		.quick_push (SSA_NAME_DEF_STMT (def));
8489 	    }
8490 	}
8491 
8492       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8493       gcc_assert (!new_bb);
8494 
8495       return true;
8496     }
8497 
8498   init_expr = vect_phi_initial_value (phi);
8499 
8500   gimple_seq stmts = NULL;
8501   if (!nested_in_vect_loop)
8502     {
8503       /* Convert the initial value to the IV update type.  */
8504       tree new_type = TREE_TYPE (step_expr);
8505       init_expr = gimple_convert (&stmts, new_type, init_expr);
8506 
8507       /* If we are using the loop mask to "peel" for alignment then we need
8508 	 to adjust the start value here.  */
8509       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8510       if (skip_niters != NULL_TREE)
8511 	{
8512 	  if (FLOAT_TYPE_P (vectype))
8513 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8514 					skip_niters);
8515 	  else
8516 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8517 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8518 					 skip_niters, step_expr);
8519 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8520 				    init_expr, skip_step);
8521 	}
8522     }
8523 
8524   if (stmts)
8525     {
8526       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8527       gcc_assert (!new_bb);
8528     }
8529 
8530   /* Create the vector that holds the initial_value of the induction.  */
8531   if (nested_in_vect_loop)
8532     {
8533       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8534 	 been created during vectorization of previous stmts.  We obtain it
8535 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8536       auto_vec<tree> vec_inits;
8537       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8538 				     init_expr, &vec_inits);
8539       vec_init = vec_inits[0];
8540       /* If the initial value is not of proper type, convert it.  */
8541       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8542 	{
8543 	  new_stmt
8544 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
8545 							  vect_simple_var,
8546 							  "vec_iv_"),
8547 				   VIEW_CONVERT_EXPR,
8548 				   build1 (VIEW_CONVERT_EXPR, vectype,
8549 					   vec_init));
8550 	  vec_init = gimple_assign_lhs (new_stmt);
8551 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8552 						 new_stmt);
8553 	  gcc_assert (!new_bb);
8554 	}
8555     }
8556   else
8557     {
8558       /* iv_loop is the loop to be vectorized. Create:
8559 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8560       stmts = NULL;
8561       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8562 
8563       unsigned HOST_WIDE_INT const_nunits;
8564       if (nunits.is_constant (&const_nunits))
8565 	{
8566 	  tree_vector_builder elts (step_vectype, const_nunits, 1);
8567 	  elts.quick_push (new_name);
8568 	  for (i = 1; i < const_nunits; i++)
8569 	    {
8570 	      /* Create: new_name_i = new_name + step_expr  */
8571 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8572 				       new_name, step_expr);
8573 	      elts.quick_push (new_name);
8574 	    }
8575 	  /* Create a vector from [new_name_0, new_name_1, ...,
8576 	     new_name_nunits-1]  */
8577 	  vec_init = gimple_build_vector (&stmts, &elts);
8578 	}
8579       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8580 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
8581 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8582 				 new_name, step_expr);
8583       else
8584 	{
8585 	  /* Build:
8586 	        [base, base, base, ...]
8587 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8588 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8589 	  gcc_assert (flag_associative_math);
8590 	  tree index = build_index_vector (step_vectype, 0, 1);
8591 	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8592 							new_name);
8593 	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8594 							step_expr);
8595 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8596 	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8597 				   vec_init, step_vec);
8598 	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8599 				   vec_init, base_vec);
8600 	}
8601       vec_init = gimple_convert (&stmts, vectype, vec_init);
8602 
8603       if (stmts)
8604 	{
8605 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8606 	  gcc_assert (!new_bb);
8607 	}
8608     }
8609 
8610 
8611   /* Create the vector that holds the step of the induction.  */
8612   if (nested_in_vect_loop)
8613     /* iv_loop is nested in the loop to be vectorized. Generate:
8614        vec_step = [S, S, S, S]  */
8615     new_name = step_expr;
8616   else
8617     {
8618       /* iv_loop is the loop to be vectorized. Generate:
8619 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8620       gimple_seq seq = NULL;
8621       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8622 	{
8623 	  expr = build_int_cst (integer_type_node, vf);
8624 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8625 	}
8626       else
8627 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
8628       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8629 			       expr, step_expr);
8630       if (seq)
8631 	{
8632 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8633 	  gcc_assert (!new_bb);
8634 	}
8635     }
8636 
8637   t = unshare_expr (new_name);
8638   gcc_assert (CONSTANT_CLASS_P (new_name)
8639 	      || TREE_CODE (new_name) == SSA_NAME);
8640   new_vec = build_vector_from_val (step_vectype, t);
8641   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8642 			       new_vec, step_vectype, NULL);
8643 
8644 
8645   /* Create the following def-use cycle:
8646      loop prolog:
8647          vec_init = ...
8648 	 vec_step = ...
8649      loop:
8650          vec_iv = PHI <vec_init, vec_loop>
8651          ...
8652          STMT
8653          ...
8654          vec_loop = vec_iv + vec_step;  */
8655 
8656   /* Create the induction-phi that defines the induction-operand.  */
8657   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8658   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8659   induc_def = PHI_RESULT (induction_phi);
8660 
8661   /* Create the iv update inside the loop  */
8662   stmts = NULL;
8663   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8664   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8665   vec_def = gimple_convert (&stmts, vectype, vec_def);
8666   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8667   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8668 
8669   /* Set the arguments of the phi node:  */
8670   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8671   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8672 	       UNKNOWN_LOCATION);
8673 
8674   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8675   *vec_stmt = induction_phi;
8676 
8677   /* In case that vectorization factor (VF) is bigger than the number
8678      of elements that we can fit in a vectype (nunits), we have to generate
8679      more than one vector stmt - i.e - we need to "unroll" the
8680      vector stmt by a factor VF/nunits.  For more details see documentation
8681      in vectorizable_operation.  */
8682 
8683   if (ncopies > 1)
8684     {
8685       gimple_seq seq = NULL;
8686       /* FORNOW. This restriction should be relaxed.  */
8687       gcc_assert (!nested_in_vect_loop);
8688 
8689       /* Create the vector that holds the step of the induction.  */
8690       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8691 	{
8692 	  expr = build_int_cst (integer_type_node, nunits);
8693 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8694 	}
8695       else
8696 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8697       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8698 			       expr, step_expr);
8699       if (seq)
8700 	{
8701 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8702 	  gcc_assert (!new_bb);
8703 	}
8704 
8705       t = unshare_expr (new_name);
8706       gcc_assert (CONSTANT_CLASS_P (new_name)
8707 		  || TREE_CODE (new_name) == SSA_NAME);
8708       new_vec = build_vector_from_val (step_vectype, t);
8709       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8710 				   new_vec, step_vectype, NULL);
8711 
8712       vec_def = induc_def;
8713       for (i = 1; i < ncopies; i++)
8714 	{
8715 	  /* vec_i = vec_prev + vec_step  */
8716 	  gimple_seq stmts = NULL;
8717 	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8718 	  vec_def = gimple_build (&stmts,
8719 				  PLUS_EXPR, step_vectype, vec_def, vec_step);
8720 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
8721 
8722 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8723 	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
8724 	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8725 	}
8726     }
8727 
8728   if (dump_enabled_p ())
8729     dump_printf_loc (MSG_NOTE, vect_location,
8730 		     "transform induction: created def-use cycle: %G%G",
8731 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
8732 
8733   return true;
8734 }
8735 
8736 /* Function vectorizable_live_operation.
8737 
8738    STMT_INFO computes a value that is used outside the loop.  Check if
8739    it can be supported.  */
8740 
8741 bool
8742 vectorizable_live_operation (vec_info *vinfo,
8743 			     stmt_vec_info stmt_info,
8744 			     gimple_stmt_iterator *gsi,
8745 			     slp_tree slp_node, slp_instance slp_node_instance,
8746 			     int slp_index, bool vec_stmt_p,
8747 			     stmt_vector_for_cost *cost_vec)
8748 {
8749   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8750   imm_use_iterator imm_iter;
8751   tree lhs, lhs_type, bitsize;
8752   tree vectype = (slp_node
8753 		  ? SLP_TREE_VECTYPE (slp_node)
8754 		  : STMT_VINFO_VECTYPE (stmt_info));
8755   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8756   int ncopies;
8757   gimple *use_stmt;
8758   auto_vec<tree> vec_oprnds;
8759   int vec_entry = 0;
8760   poly_uint64 vec_index = 0;
8761 
8762   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8763 
8764   /* If a stmt of a reduction is live, vectorize it via
8765      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8766      validity so just trigger the transform here.  */
8767   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8768     {
8769       if (!vec_stmt_p)
8770 	return true;
8771       if (slp_node)
8772 	{
8773 	  /* For reduction chains the meta-info is attached to
8774 	     the group leader.  */
8775 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8776 	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8777 	  /* For SLP reductions we vectorize the epilogue for
8778 	     all involved stmts together.  */
8779 	  else if (slp_index != 0)
8780 	    return true;
8781 	}
8782       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8783       gcc_assert (reduc_info->is_reduc_info);
8784       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8785 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8786 	return true;
8787       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8788 					slp_node_instance);
8789       return true;
8790     }
8791 
8792   /* If STMT is not relevant and it is a simple assignment and its inputs are
8793      invariant then it can remain in place, unvectorized.  The original last
8794      scalar value that it computes will be used.  */
8795   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8796     {
8797       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8798       if (dump_enabled_p ())
8799 	dump_printf_loc (MSG_NOTE, vect_location,
8800 			 "statement is simple and uses invariant.  Leaving in "
8801 			 "place.\n");
8802       return true;
8803     }
8804 
8805   if (slp_node)
8806     ncopies = 1;
8807   else
8808     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8809 
8810   if (slp_node)
8811     {
8812       gcc_assert (slp_index >= 0);
8813 
8814       /* Get the last occurrence of the scalar index from the concatenation of
8815 	 all the slp vectors. Calculate which slp vector it is and the index
8816 	 within.  */
8817       int num_scalar = SLP_TREE_LANES (slp_node);
8818       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8819       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8820 
8821       /* Calculate which vector contains the result, and which lane of
8822 	 that vector we need.  */
8823       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8824 	{
8825 	  if (dump_enabled_p ())
8826 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8827 			     "Cannot determine which vector holds the"
8828 			     " final result.\n");
8829 	  return false;
8830 	}
8831     }
8832 
8833   if (!vec_stmt_p)
8834     {
8835       /* No transformation required.  */
8836       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8837 	{
8838 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8839 					       OPTIMIZE_FOR_SPEED))
8840 	    {
8841 	      if (dump_enabled_p ())
8842 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8843 				 "can't operate on partial vectors "
8844 				 "because the target doesn't support extract "
8845 				 "last reduction.\n");
8846 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8847 	    }
8848 	  else if (slp_node)
8849 	    {
8850 	      if (dump_enabled_p ())
8851 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8852 				 "can't operate on partial vectors "
8853 				 "because an SLP statement is live after "
8854 				 "the loop.\n");
8855 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8856 	    }
8857 	  else if (ncopies > 1)
8858 	    {
8859 	      if (dump_enabled_p ())
8860 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8861 				 "can't operate on partial vectors "
8862 				 "because ncopies is greater than 1.\n");
8863 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8864 	    }
8865 	  else
8866 	    {
8867 	      gcc_assert (ncopies == 1 && !slp_node);
8868 	      vect_record_loop_mask (loop_vinfo,
8869 				     &LOOP_VINFO_MASKS (loop_vinfo),
8870 				     1, vectype, NULL);
8871 	    }
8872 	}
8873       /* ???  Enable for loop costing as well.  */
8874       if (!loop_vinfo)
8875 	record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8876 			  0, vect_epilogue);
8877       return true;
8878     }
8879 
8880   /* Use the lhs of the original scalar statement.  */
8881   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8882   if (dump_enabled_p ())
8883     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8884 		     "stmt %G", stmt);
8885 
8886   lhs = gimple_get_lhs (stmt);
8887   lhs_type = TREE_TYPE (lhs);
8888 
8889   bitsize = vector_element_bits_tree (vectype);
8890 
8891   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8892   tree vec_lhs, bitstart;
8893   gimple *vec_stmt;
8894   if (slp_node)
8895     {
8896       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8897 
8898       /* Get the correct slp vectorized stmt.  */
8899       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8900       vec_lhs = gimple_get_lhs (vec_stmt);
8901 
8902       /* Get entry to use.  */
8903       bitstart = bitsize_int (vec_index);
8904       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8905     }
8906   else
8907     {
8908       /* For multiple copies, get the last copy.  */
8909       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8910       vec_lhs = gimple_get_lhs (vec_stmt);
8911 
8912       /* Get the last lane in the vector.  */
8913       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8914     }
8915 
8916   if (loop_vinfo)
8917     {
8918       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8919 	 requirement, insert one phi node for it.  It looks like:
8920 	   loop;
8921 	 BB:
8922 	   # lhs' = PHI <lhs>
8923 	 ==>
8924 	   loop;
8925 	 BB:
8926 	   # vec_lhs' = PHI <vec_lhs>
8927 	   new_tree = lane_extract <vec_lhs', ...>;
8928 	   lhs' = new_tree;  */
8929 
8930       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8931       basic_block exit_bb = single_exit (loop)->dest;
8932       gcc_assert (single_pred_p (exit_bb));
8933 
8934       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8935       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8936       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8937 
8938       gimple_seq stmts = NULL;
8939       tree new_tree;
8940       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8941 	{
8942 	  /* Emit:
8943 
8944 	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8945 
8946 	     where VEC_LHS is the vectorized live-out result and MASK is
8947 	     the loop mask for the final iteration.  */
8948 	  gcc_assert (ncopies == 1 && !slp_node);
8949 	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8950 	  tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8951 					  1, vectype, 0);
8952 	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8953 					  mask, vec_lhs_phi);
8954 
8955 	  /* Convert the extracted vector element to the scalar type.  */
8956 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8957 	}
8958       else
8959 	{
8960 	  tree bftype = TREE_TYPE (vectype);
8961 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
8962 	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8963 	  new_tree = build3 (BIT_FIELD_REF, bftype,
8964 			     vec_lhs_phi, bitsize, bitstart);
8965 	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8966 					   &stmts, true, NULL_TREE);
8967 	}
8968 
8969       if (stmts)
8970 	{
8971 	  gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8972 	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8973 
8974 	  /* Remove existing phi from lhs and create one copy from new_tree.  */
8975 	  tree lhs_phi = NULL_TREE;
8976 	  gimple_stmt_iterator gsi;
8977 	  for (gsi = gsi_start_phis (exit_bb);
8978 	       !gsi_end_p (gsi); gsi_next (&gsi))
8979 	    {
8980 	      gimple *phi = gsi_stmt (gsi);
8981 	      if ((gimple_phi_arg_def (phi, 0) == lhs))
8982 		{
8983 		  remove_phi_node (&gsi, false);
8984 		  lhs_phi = gimple_phi_result (phi);
8985 		  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8986 		  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8987 		  break;
8988 		}
8989 	    }
8990 	}
8991 
8992       /* Replace use of lhs with newly computed result.  If the use stmt is a
8993 	 single arg PHI, just replace all uses of PHI result.  It's necessary
8994 	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
8995       use_operand_p use_p;
8996       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8997 	if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8998 	    && !is_gimple_debug (use_stmt))
8999 	  {
9000 	    if (gimple_code (use_stmt) == GIMPLE_PHI
9001 		&& gimple_phi_num_args (use_stmt) == 1)
9002 	      {
9003 		replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9004 	      }
9005 	    else
9006 	      {
9007 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9008 		    SET_USE (use_p, new_tree);
9009 	      }
9010 	    update_stmt (use_stmt);
9011 	  }
9012     }
9013   else
9014     {
9015       /* For basic-block vectorization simply insert the lane-extraction.  */
9016       tree bftype = TREE_TYPE (vectype);
9017       if (VECTOR_BOOLEAN_TYPE_P (vectype))
9018 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9019       tree new_tree = build3 (BIT_FIELD_REF, bftype,
9020 			      vec_lhs, bitsize, bitstart);
9021       gimple_seq stmts = NULL;
9022       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9023 				       &stmts, true, NULL_TREE);
9024       if (TREE_CODE (new_tree) == SSA_NAME
9025 	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9026 	SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9027       if (is_a <gphi *> (vec_stmt))
9028 	{
9029 	  gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9030 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9031 	}
9032       else
9033 	{
9034 	  gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9035 	  gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9036 	}
9037 
9038       /* Replace use of lhs with newly computed result.  If the use stmt is a
9039 	 single arg PHI, just replace all uses of PHI result.  It's necessary
9040 	 because lcssa PHI defining lhs may be before newly inserted stmt.  */
9041       use_operand_p use_p;
9042       stmt_vec_info use_stmt_info;
9043       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9044 	if (!is_gimple_debug (use_stmt)
9045 	    && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9046 		|| !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9047 	  {
9048 	    /* ???  This can happen when the live lane ends up being
9049 	       used in a vector construction code-generated by an
9050 	       external SLP node (and code-generation for that already
9051 	       happened).  See gcc.dg/vect/bb-slp-47.c.
9052 	       Doing this is what would happen if that vector CTOR
9053 	       were not code-generated yet so it is not too bad.
9054 	       ???  In fact we'd likely want to avoid this situation
9055 	       in the first place.  */
9056 	    if (TREE_CODE (new_tree) == SSA_NAME
9057 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9058 		&& gimple_code (use_stmt) != GIMPLE_PHI
9059 		&& !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9060 						use_stmt))
9061 	      {
9062 		enum tree_code code = gimple_assign_rhs_code (use_stmt);
9063 		gcc_checking_assert (code == SSA_NAME
9064 				     || code == CONSTRUCTOR
9065 				     || code == VIEW_CONVERT_EXPR
9066 				     || CONVERT_EXPR_CODE_P (code));
9067 		if (dump_enabled_p ())
9068 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9069 				   "Using original scalar computation for "
9070 				   "live lane because use preceeds vector "
9071 				   "def\n");
9072 		continue;
9073 	      }
9074 	    /* ???  It can also happen that we end up pulling a def into
9075 	       a loop where replacing out-of-loop uses would require
9076 	       a new LC SSA PHI node.  Retain the original scalar in
9077 	       those cases as well.  PR98064.  */
9078 	    if (TREE_CODE (new_tree) == SSA_NAME
9079 		&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9080 		&& (gimple_bb (use_stmt)->loop_father
9081 		    != gimple_bb (vec_stmt)->loop_father)
9082 		&& !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9083 					gimple_bb (use_stmt)->loop_father))
9084 	      {
9085 		if (dump_enabled_p ())
9086 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9087 				   "Using original scalar computation for "
9088 				   "live lane because there is an out-of-loop "
9089 				   "definition for it\n");
9090 		continue;
9091 	      }
9092 	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9093 	      SET_USE (use_p, new_tree);
9094 	    update_stmt (use_stmt);
9095 	  }
9096     }
9097 
9098   return true;
9099 }
9100 
9101 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9102 
9103 static void
9104 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9105 {
9106   ssa_op_iter op_iter;
9107   imm_use_iterator imm_iter;
9108   def_operand_p def_p;
9109   gimple *ustmt;
9110 
9111   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9112     {
9113       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9114 	{
9115 	  basic_block bb;
9116 
9117 	  if (!is_gimple_debug (ustmt))
9118 	    continue;
9119 
9120 	  bb = gimple_bb (ustmt);
9121 
9122 	  if (!flow_bb_inside_loop_p (loop, bb))
9123 	    {
9124 	      if (gimple_debug_bind_p (ustmt))
9125 		{
9126 		  if (dump_enabled_p ())
9127 		    dump_printf_loc (MSG_NOTE, vect_location,
9128                                      "killing debug use\n");
9129 
9130 		  gimple_debug_bind_reset_value (ustmt);
9131 		  update_stmt (ustmt);
9132 		}
9133 	      else
9134 		gcc_unreachable ();
9135 	    }
9136 	}
9137     }
9138 }
9139 
9140 /* Given loop represented by LOOP_VINFO, return true if computation of
9141    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9142    otherwise.  */
9143 
9144 static bool
9145 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9146 {
9147   /* Constant case.  */
9148   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9149     {
9150       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9151       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9152 
9153       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9154       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9155       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9156 	return true;
9157     }
9158 
9159   widest_int max;
9160   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9161   /* Check the upper bound of loop niters.  */
9162   if (get_max_loop_iterations (loop, &max))
9163     {
9164       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9165       signop sgn = TYPE_SIGN (type);
9166       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9167       if (max < type_max)
9168 	return true;
9169     }
9170   return false;
9171 }
9172 
9173 /* Return a mask type with half the number of elements as OLD_TYPE,
9174    given that it should have mode NEW_MODE.  */
9175 
9176 tree
9177 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9178 {
9179   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9180   return build_truth_vector_type_for_mode (nunits, new_mode);
9181 }
9182 
9183 /* Return a mask type with twice as many elements as OLD_TYPE,
9184    given that it should have mode NEW_MODE.  */
9185 
9186 tree
9187 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9188 {
9189   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9190   return build_truth_vector_type_for_mode (nunits, new_mode);
9191 }
9192 
9193 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9194    contain a sequence of NVECTORS masks that each control a vector of type
9195    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9196    these vector masks with the vector version of SCALAR_MASK.  */
9197 
9198 void
9199 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9200 		       unsigned int nvectors, tree vectype, tree scalar_mask)
9201 {
9202   gcc_assert (nvectors != 0);
9203   if (masks->length () < nvectors)
9204     masks->safe_grow_cleared (nvectors, true);
9205   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9206   /* The number of scalars per iteration and the number of vectors are
9207      both compile-time constants.  */
9208   unsigned int nscalars_per_iter
9209     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9210 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9211 
9212   if (scalar_mask)
9213     {
9214       scalar_cond_masked_key cond (scalar_mask, nvectors);
9215       loop_vinfo->scalar_cond_masked_set.add (cond);
9216     }
9217 
9218   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9219     {
9220       rgm->max_nscalars_per_iter = nscalars_per_iter;
9221       rgm->type = truth_type_for (vectype);
9222       rgm->factor = 1;
9223     }
9224 }
9225 
9226 /* Given a complete set of masks MASKS, extract mask number INDEX
9227    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9228    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9229 
9230    See the comment above vec_loop_masks for more details about the mask
9231    arrangement.  */
9232 
9233 tree
9234 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9235 		    unsigned int nvectors, tree vectype, unsigned int index)
9236 {
9237   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9238   tree mask_type = rgm->type;
9239 
9240   /* Populate the rgroup's mask array, if this is the first time we've
9241      used it.  */
9242   if (rgm->controls.is_empty ())
9243     {
9244       rgm->controls.safe_grow_cleared (nvectors, true);
9245       for (unsigned int i = 0; i < nvectors; ++i)
9246 	{
9247 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9248 	  /* Provide a dummy definition until the real one is available.  */
9249 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9250 	  rgm->controls[i] = mask;
9251 	}
9252     }
9253 
9254   tree mask = rgm->controls[index];
9255   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9256 		TYPE_VECTOR_SUBPARTS (vectype)))
9257     {
9258       /* A loop mask for data type X can be reused for data type Y
9259 	 if X has N times more elements than Y and if Y's elements
9260 	 are N times bigger than X's.  In this case each sequence
9261 	 of N elements in the loop mask will be all-zero or all-one.
9262 	 We can then view-convert the mask so that each sequence of
9263 	 N elements is replaced by a single element.  */
9264       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9265 			      TYPE_VECTOR_SUBPARTS (vectype)));
9266       gimple_seq seq = NULL;
9267       mask_type = truth_type_for (vectype);
9268       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9269       if (seq)
9270 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9271     }
9272   return mask;
9273 }
9274 
9275 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9276    lengths for controlling an operation on VECTYPE.  The operation splits
9277    each element of VECTYPE into FACTOR separate subelements, measuring the
9278    length as a number of these subelements.  */
9279 
9280 void
9281 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9282 		      unsigned int nvectors, tree vectype, unsigned int factor)
9283 {
9284   gcc_assert (nvectors != 0);
9285   if (lens->length () < nvectors)
9286     lens->safe_grow_cleared (nvectors, true);
9287   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9288 
9289   /* The number of scalars per iteration, scalar occupied bytes and
9290      the number of vectors are both compile-time constants.  */
9291   unsigned int nscalars_per_iter
9292     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9293 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9294 
9295   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9296     {
9297       /* For now, we only support cases in which all loads and stores fall back
9298 	 to VnQI or none do.  */
9299       gcc_assert (!rgl->max_nscalars_per_iter
9300 		  || (rgl->factor == 1 && factor == 1)
9301 		  || (rgl->max_nscalars_per_iter * rgl->factor
9302 		      == nscalars_per_iter * factor));
9303       rgl->max_nscalars_per_iter = nscalars_per_iter;
9304       rgl->type = vectype;
9305       rgl->factor = factor;
9306     }
9307 }
9308 
9309 /* Given a complete set of length LENS, extract length number INDEX for an
9310    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9311 
9312 tree
9313 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9314 		   unsigned int nvectors, unsigned int index)
9315 {
9316   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9317   bool use_bias_adjusted_len =
9318     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
9319 
9320   /* Populate the rgroup's len array, if this is the first time we've
9321      used it.  */
9322   if (rgl->controls.is_empty ())
9323     {
9324       rgl->controls.safe_grow_cleared (nvectors, true);
9325       for (unsigned int i = 0; i < nvectors; ++i)
9326 	{
9327 	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9328 	  gcc_assert (len_type != NULL_TREE);
9329 
9330 	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9331 
9332 	  /* Provide a dummy definition until the real one is available.  */
9333 	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9334 	  rgl->controls[i] = len;
9335 
9336 	  if (use_bias_adjusted_len)
9337 	    {
9338 	      gcc_assert (i == 0);
9339 	      tree adjusted_len =
9340 		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9341 	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9342 	      rgl->bias_adjusted_ctrl = adjusted_len;
9343 	    }
9344 	}
9345     }
9346 
9347   if (use_bias_adjusted_len)
9348     return rgl->bias_adjusted_ctrl;
9349   else
9350     return rgl->controls[index];
9351 }
9352 
9353 /* Scale profiling counters by estimation for LOOP which is vectorized
9354    by factor VF.  */
9355 
9356 static void
9357 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9358 {
9359   edge preheader = loop_preheader_edge (loop);
9360   /* Reduce loop iterations by the vectorization factor.  */
9361   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9362   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9363 
9364   if (freq_h.nonzero_p ())
9365     {
9366       profile_probability p;
9367 
9368       /* Avoid dropping loop body profile counter to 0 because of zero count
9369 	 in loop's preheader.  */
9370       if (!(freq_e == profile_count::zero ()))
9371         freq_e = freq_e.force_nonzero ();
9372       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9373       scale_loop_frequencies (loop, p);
9374     }
9375 
9376   edge exit_e = single_exit (loop);
9377   exit_e->probability = profile_probability::always ()
9378 				 .apply_scale (1, new_est_niter + 1);
9379 
9380   edge exit_l = single_pred_edge (loop->latch);
9381   profile_probability prob = exit_l->probability;
9382   exit_l->probability = exit_e->probability.invert ();
9383   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9384     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9385 }
9386 
9387 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9388    latch edge values originally defined by it.  */
9389 
9390 static void
9391 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9392 				     stmt_vec_info def_stmt_info)
9393 {
9394   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9395   if (!def || TREE_CODE (def) != SSA_NAME)
9396     return;
9397   stmt_vec_info phi_info;
9398   imm_use_iterator iter;
9399   use_operand_p use_p;
9400   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9401     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9402       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9403 	  && (phi_info = loop_vinfo->lookup_stmt (phi))
9404 	  && STMT_VINFO_RELEVANT_P (phi_info)
9405 	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9406 	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9407 	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9408 	{
9409 	  loop_p loop = gimple_bb (phi)->loop_father;
9410 	  edge e = loop_latch_edge (loop);
9411 	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9412 	    {
9413 	      vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9414 	      vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9415 	      gcc_assert (phi_defs.length () == latch_defs.length ());
9416 	      for (unsigned i = 0; i < phi_defs.length (); ++i)
9417 		add_phi_arg (as_a <gphi *> (phi_defs[i]),
9418 			     gimple_get_lhs (latch_defs[i]), e,
9419 			     gimple_phi_arg_location (phi, e->dest_idx));
9420 	    }
9421 	}
9422 }
9423 
9424 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9425    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9426    stmt_vec_info.  */
9427 
9428 static bool
9429 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9430 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9431 {
9432   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9433   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9434 
9435   if (dump_enabled_p ())
9436     dump_printf_loc (MSG_NOTE, vect_location,
9437 		     "------>vectorizing statement: %G", stmt_info->stmt);
9438 
9439   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9440     vect_loop_kill_debug_uses (loop, stmt_info);
9441 
9442   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9443       && !STMT_VINFO_LIVE_P (stmt_info))
9444     return false;
9445 
9446   if (STMT_VINFO_VECTYPE (stmt_info))
9447     {
9448       poly_uint64 nunits
9449 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9450       if (!STMT_SLP_TYPE (stmt_info)
9451 	  && maybe_ne (nunits, vf)
9452 	  && dump_enabled_p ())
9453 	/* For SLP VF is set according to unrolling factor, and not
9454 	   to vector size, hence for SLP this print is not valid.  */
9455 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9456     }
9457 
9458   /* Pure SLP statements have already been vectorized.  We still need
9459      to apply loop vectorization to hybrid SLP statements.  */
9460   if (PURE_SLP_STMT (stmt_info))
9461     return false;
9462 
9463   if (dump_enabled_p ())
9464     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9465 
9466   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9467     *seen_store = stmt_info;
9468 
9469   return true;
9470 }
9471 
9472 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9473    in the hash_map with its corresponding values.  */
9474 
9475 static tree
9476 find_in_mapping (tree t, void *context)
9477 {
9478   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9479 
9480   tree *value = mapping->get (t);
9481   return value ? *value : t;
9482 }
9483 
9484 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9485    original loop that has now been vectorized.
9486 
9487    The inits of the data_references need to be advanced with the number of
9488    iterations of the main loop.  This has been computed in vect_do_peeling and
9489    is stored in parameter ADVANCE.  We first restore the data_references
9490    initial offset with the values recored in ORIG_DRS_INIT.
9491 
9492    Since the loop_vec_info of this EPILOGUE was constructed for the original
9493    loop, its stmt_vec_infos all point to the original statements.  These need
9494    to be updated to point to their corresponding copies as well as the SSA_NAMES
9495    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9496 
9497    The data_reference's connections also need to be updated.  Their
9498    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9499    stmt_vec_infos, their statements need to point to their corresponding copy,
9500    if they are gather loads or scatter stores then their reference needs to be
9501    updated to point to its corresponding copy and finally we set
9502    'base_misaligned' to false as we have already peeled for alignment in the
9503    prologue of the main loop.  */
9504 
9505 static void
9506 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9507 {
9508   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9509   auto_vec<gimple *> stmt_worklist;
9510   hash_map<tree,tree> mapping;
9511   gimple *orig_stmt, *new_stmt;
9512   gimple_stmt_iterator epilogue_gsi;
9513   gphi_iterator epilogue_phi_gsi;
9514   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9515   basic_block *epilogue_bbs = get_loop_body (epilogue);
9516   unsigned i;
9517 
9518   free (LOOP_VINFO_BBS (epilogue_vinfo));
9519   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9520 
9521   /* Advance data_reference's with the number of iterations of the previous
9522      loop and its prologue.  */
9523   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9524 
9525 
9526   /* The EPILOGUE loop is a copy of the original loop so they share the same
9527      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9528      point to the copied statements.  We also create a mapping of all LHS' in
9529      the original loop and all the LHS' in the EPILOGUE and create worklists to
9530      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9531   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9532     {
9533       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9534 	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9535 	{
9536 	  new_stmt = epilogue_phi_gsi.phi ();
9537 
9538 	  gcc_assert (gimple_uid (new_stmt) > 0);
9539 	  stmt_vinfo
9540 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9541 
9542 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9543 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9544 
9545 	  mapping.put (gimple_phi_result (orig_stmt),
9546 		       gimple_phi_result (new_stmt));
9547 	  /* PHI nodes can not have patterns or related statements.  */
9548 	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9549 		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9550 	}
9551 
9552       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9553 	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9554 	{
9555 	  new_stmt = gsi_stmt (epilogue_gsi);
9556 	  if (is_gimple_debug (new_stmt))
9557 	    continue;
9558 
9559 	  gcc_assert (gimple_uid (new_stmt) > 0);
9560 	  stmt_vinfo
9561 	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9562 
9563 	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9564 	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9565 
9566 	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
9567 	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9568 
9569 	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9570 	    {
9571 	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9572 	      for (gimple_stmt_iterator gsi = gsi_start (seq);
9573 		   !gsi_end_p (gsi); gsi_next (&gsi))
9574 		stmt_worklist.safe_push (gsi_stmt (gsi));
9575 	    }
9576 
9577 	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9578 	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9579 	    {
9580 	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9581 	      stmt_worklist.safe_push (stmt);
9582 	      /* Set BB such that the assert in
9583 		'get_initial_def_for_reduction' is able to determine that
9584 		the BB of the related stmt is inside this loop.  */
9585 	      gimple_set_bb (stmt,
9586 			     gimple_bb (new_stmt));
9587 	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9588 	      gcc_assert (related_vinfo == NULL
9589 			  || related_vinfo == stmt_vinfo);
9590 	    }
9591 	}
9592     }
9593 
9594   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9595      using the original main loop and thus need to be updated to refer to the
9596      cloned variables used in the epilogue.  */
9597   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9598     {
9599       gimple *stmt = stmt_worklist[i];
9600       tree *new_op;
9601 
9602       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9603 	{
9604 	  tree op = gimple_op (stmt, j);
9605 	  if ((new_op = mapping.get(op)))
9606 	    gimple_set_op (stmt, j, *new_op);
9607 	  else
9608 	    {
9609 	      /* PR92429: The last argument of simplify_replace_tree disables
9610 		 folding when replacing arguments.  This is required as
9611 		 otherwise you might end up with different statements than the
9612 		 ones analyzed in vect_loop_analyze, leading to different
9613 		 vectorization.  */
9614 	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9615 					  &find_in_mapping, &mapping, false);
9616 	      gimple_set_op (stmt, j, op);
9617 	    }
9618 	}
9619     }
9620 
9621   struct data_reference *dr;
9622   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9623   FOR_EACH_VEC_ELT (datarefs, i, dr)
9624     {
9625       orig_stmt = DR_STMT (dr);
9626       gcc_assert (gimple_uid (orig_stmt) > 0);
9627       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9628       /* Data references for gather loads and scatter stores do not use the
9629 	 updated offset we set using ADVANCE.  Instead we have to make sure the
9630 	 reference in the data references point to the corresponding copy of
9631 	 the original in the epilogue.  */
9632       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9633 	  == VMAT_GATHER_SCATTER)
9634 	{
9635 	  DR_REF (dr)
9636 	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9637 				     &find_in_mapping, &mapping);
9638 	  DR_BASE_ADDRESS (dr)
9639 	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9640 				     &find_in_mapping, &mapping);
9641 	}
9642       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9643       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9644       /* The vector size of the epilogue is smaller than that of the main loop
9645 	 so the alignment is either the same or lower. This means the dr will
9646 	 thus by definition be aligned.  */
9647       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9648     }
9649 
9650   epilogue_vinfo->shared->datarefs_copy.release ();
9651   epilogue_vinfo->shared->save_datarefs ();
9652 }
9653 
9654 /* Function vect_transform_loop.
9655 
9656    The analysis phase has determined that the loop is vectorizable.
9657    Vectorize the loop - created vectorized stmts to replace the scalar
9658    stmts in the loop, and update the loop exit condition.
9659    Returns scalar epilogue loop if any.  */
9660 
9661 class loop *
9662 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9663 {
9664   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9665   class loop *epilogue = NULL;
9666   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9667   int nbbs = loop->num_nodes;
9668   int i;
9669   tree niters_vector = NULL_TREE;
9670   tree step_vector = NULL_TREE;
9671   tree niters_vector_mult_vf = NULL_TREE;
9672   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9673   unsigned int lowest_vf = constant_lower_bound (vf);
9674   gimple *stmt;
9675   bool check_profitability = false;
9676   unsigned int th;
9677 
9678   DUMP_VECT_SCOPE ("vec_transform_loop");
9679 
9680   loop_vinfo->shared->check_datarefs ();
9681 
9682   /* Use the more conservative vectorization threshold.  If the number
9683      of iterations is constant assume the cost check has been performed
9684      by our caller.  If the threshold makes all loops profitable that
9685      run at least the (estimated) vectorization factor number of times
9686      checking is pointless, too.  */
9687   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9688   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9689     {
9690       if (dump_enabled_p ())
9691 	dump_printf_loc (MSG_NOTE, vect_location,
9692 			 "Profitability threshold is %d loop iterations.\n",
9693 			 th);
9694       check_profitability = true;
9695     }
9696 
9697   /* Make sure there exists a single-predecessor exit bb.  Do this before
9698      versioning.   */
9699   edge e = single_exit (loop);
9700   if (! single_pred_p (e->dest))
9701     {
9702       split_loop_exit_edge (e, true);
9703       if (dump_enabled_p ())
9704 	dump_printf (MSG_NOTE, "split exit edge\n");
9705     }
9706 
9707   /* Version the loop first, if required, so the profitability check
9708      comes first.  */
9709 
9710   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9711     {
9712       class loop *sloop
9713 	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9714       sloop->force_vectorize = false;
9715       check_profitability = false;
9716     }
9717 
9718   /* Make sure there exists a single-predecessor exit bb also on the
9719      scalar loop copy.  Do this after versioning but before peeling
9720      so CFG structure is fine for both scalar and if-converted loop
9721      to make slpeel_duplicate_current_defs_from_edges face matched
9722      loop closed PHI nodes on the exit.  */
9723   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9724     {
9725       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9726       if (! single_pred_p (e->dest))
9727 	{
9728 	  split_loop_exit_edge (e, true);
9729 	  if (dump_enabled_p ())
9730 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9731 	}
9732     }
9733 
9734   tree niters = vect_build_loop_niters (loop_vinfo);
9735   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9736   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9737   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9738   tree advance;
9739   drs_init_vec orig_drs_init;
9740 
9741   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9742 			      &step_vector, &niters_vector_mult_vf, th,
9743 			      check_profitability, niters_no_overflow,
9744 			      &advance);
9745 
9746   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9747       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9748     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9749 			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9750 
9751   if (niters_vector == NULL_TREE)
9752     {
9753       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9754 	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9755 	  && known_eq (lowest_vf, vf))
9756 	{
9757 	  niters_vector
9758 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9759 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9760 	  step_vector = build_one_cst (TREE_TYPE (niters));
9761 	}
9762       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9763 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9764 				     &step_vector, niters_no_overflow);
9765       else
9766 	/* vect_do_peeling subtracted the number of peeled prologue
9767 	   iterations from LOOP_VINFO_NITERS.  */
9768 	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9769 				     &niters_vector, &step_vector,
9770 				     niters_no_overflow);
9771     }
9772 
9773   /* 1) Make sure the loop header has exactly two entries
9774      2) Make sure we have a preheader basic block.  */
9775 
9776   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9777 
9778   split_edge (loop_preheader_edge (loop));
9779 
9780   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9781     /* This will deal with any possible peeling.  */
9782     vect_prepare_for_masked_peels (loop_vinfo);
9783 
9784   /* Schedule the SLP instances first, then handle loop vectorization
9785      below.  */
9786   if (!loop_vinfo->slp_instances.is_empty ())
9787     {
9788       DUMP_VECT_SCOPE ("scheduling SLP instances");
9789       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9790     }
9791 
9792   /* FORNOW: the vectorizer supports only loops which body consist
9793      of one basic block (header + empty latch). When the vectorizer will
9794      support more involved loop forms, the order by which the BBs are
9795      traversed need to be reconsidered.  */
9796 
9797   for (i = 0; i < nbbs; i++)
9798     {
9799       basic_block bb = bbs[i];
9800       stmt_vec_info stmt_info;
9801 
9802       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9803 	   gsi_next (&si))
9804 	{
9805 	  gphi *phi = si.phi ();
9806 	  if (dump_enabled_p ())
9807 	    dump_printf_loc (MSG_NOTE, vect_location,
9808 			     "------>vectorizing phi: %G", phi);
9809 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9810 	  if (!stmt_info)
9811 	    continue;
9812 
9813 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9814 	    vect_loop_kill_debug_uses (loop, stmt_info);
9815 
9816 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9817 	      && !STMT_VINFO_LIVE_P (stmt_info))
9818 	    continue;
9819 
9820 	  if (STMT_VINFO_VECTYPE (stmt_info)
9821 	      && (maybe_ne
9822 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9823 	      && dump_enabled_p ())
9824 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9825 
9826 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9827 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9828 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9829 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9830 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9831 	      && ! PURE_SLP_STMT (stmt_info))
9832 	    {
9833 	      if (dump_enabled_p ())
9834 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9835 	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9836 	    }
9837 	}
9838 
9839       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9840 	   gsi_next (&si))
9841 	{
9842 	  gphi *phi = si.phi ();
9843 	  stmt_info = loop_vinfo->lookup_stmt (phi);
9844 	  if (!stmt_info)
9845 	    continue;
9846 
9847 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
9848 	      && !STMT_VINFO_LIVE_P (stmt_info))
9849 	    continue;
9850 
9851 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9852 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9853 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9854 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9855 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9856 	      && ! PURE_SLP_STMT (stmt_info))
9857 	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9858 	}
9859 
9860       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9861 	   !gsi_end_p (si);)
9862 	{
9863 	  stmt = gsi_stmt (si);
9864 	  /* During vectorization remove existing clobber stmts.  */
9865 	  if (gimple_clobber_p (stmt))
9866 	    {
9867 	      unlink_stmt_vdef (stmt);
9868 	      gsi_remove (&si, true);
9869 	      release_defs (stmt);
9870 	    }
9871 	  else
9872 	    {
9873 	      /* Ignore vector stmts created in the outer loop.  */
9874 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
9875 
9876 	      /* vector stmts created in the outer-loop during vectorization of
9877 		 stmts in an inner-loop may not have a stmt_info, and do not
9878 		 need to be vectorized.  */
9879 	      stmt_vec_info seen_store = NULL;
9880 	      if (stmt_info)
9881 		{
9882 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9883 		    {
9884 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9885 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9886 			   !gsi_end_p (subsi); gsi_next (&subsi))
9887 			{
9888 			  stmt_vec_info pat_stmt_info
9889 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9890 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9891 						    &si, &seen_store);
9892 			}
9893 		      stmt_vec_info pat_stmt_info
9894 			= STMT_VINFO_RELATED_STMT (stmt_info);
9895 		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9896 						    &si, &seen_store))
9897 			maybe_set_vectorized_backedge_value (loop_vinfo,
9898 							     pat_stmt_info);
9899 		    }
9900 		  else
9901 		    {
9902 		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9903 						    &seen_store))
9904 			maybe_set_vectorized_backedge_value (loop_vinfo,
9905 							     stmt_info);
9906 		    }
9907 		}
9908 	      gsi_next (&si);
9909 	      if (seen_store)
9910 		{
9911 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9912 		    /* Interleaving.  If IS_STORE is TRUE, the
9913 		       vectorization of the interleaving chain was
9914 		       completed - free all the stores in the chain.  */
9915 		    vect_remove_stores (loop_vinfo,
9916 					DR_GROUP_FIRST_ELEMENT (seen_store));
9917 		  else
9918 		    /* Free the attached stmt_vec_info and remove the stmt.  */
9919 		    loop_vinfo->remove_stmt (stmt_info);
9920 		}
9921 	    }
9922 	}
9923 
9924       /* Stub out scalar statements that must not survive vectorization.
9925 	 Doing this here helps with grouped statements, or statements that
9926 	 are involved in patterns.  */
9927       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9928 	   !gsi_end_p (gsi); gsi_next (&gsi))
9929 	{
9930 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9931 	  if (!call || !gimple_call_internal_p (call))
9932 	    continue;
9933 	  internal_fn ifn = gimple_call_internal_fn (call);
9934 	  if (ifn == IFN_MASK_LOAD)
9935 	    {
9936 	      tree lhs = gimple_get_lhs (call);
9937 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9938 		{
9939 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
9940 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
9941 		  gsi_replace (&gsi, new_stmt, true);
9942 		}
9943 	    }
9944 	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9945 	    {
9946 	      tree lhs = gimple_get_lhs (call);
9947 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9948 		{
9949 		  tree else_arg
9950 		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9951 		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9952 		  gsi_replace (&gsi, new_stmt, true);
9953 		}
9954 	    }
9955 	}
9956     }				/* BBs in loop */
9957 
9958   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9959      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9960   if (integer_onep (step_vector))
9961     niters_no_overflow = true;
9962   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9963 			   niters_vector_mult_vf, !niters_no_overflow);
9964 
9965   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9966   scale_profile_for_vect_loop (loop, assumed_vf);
9967 
9968   /* True if the final iteration might not handle a full vector's
9969      worth of scalar iterations.  */
9970   bool final_iter_may_be_partial
9971     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9972   /* The minimum number of iterations performed by the epilogue.  This
9973      is 1 when peeling for gaps because we always need a final scalar
9974      iteration.  */
9975   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9976   /* +1 to convert latch counts to loop iteration counts,
9977      -min_epilogue_iters to remove iterations that cannot be performed
9978        by the vector code.  */
9979   int bias_for_lowest = 1 - min_epilogue_iters;
9980   int bias_for_assumed = bias_for_lowest;
9981   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9982   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9983     {
9984       /* When the amount of peeling is known at compile time, the first
9985 	 iteration will have exactly alignment_npeels active elements.
9986 	 In the worst case it will have at least one.  */
9987       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9988       bias_for_lowest += lowest_vf - min_first_active;
9989       bias_for_assumed += assumed_vf - min_first_active;
9990     }
9991   /* In these calculations the "- 1" converts loop iteration counts
9992      back to latch counts.  */
9993   if (loop->any_upper_bound)
9994     {
9995       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9996       loop->nb_iterations_upper_bound
9997 	= (final_iter_may_be_partial
9998 	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9999 			    lowest_vf) - 1
10000 	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
10001 			     lowest_vf) - 1);
10002       if (main_vinfo
10003 	  /* Both peeling for alignment and peeling for gaps can end up
10004 	     with the scalar epilogue running for more than VF-1 iterations.  */
10005 	  && !main_vinfo->peeling_for_alignment
10006 	  && !main_vinfo->peeling_for_gaps)
10007 	{
10008 	  unsigned int bound;
10009 	  poly_uint64 main_iters
10010 	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10011 			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10012 	  main_iters
10013 	    = upper_bound (main_iters,
10014 			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10015 	  if (can_div_away_from_zero_p (main_iters,
10016 					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10017 					&bound))
10018 	    loop->nb_iterations_upper_bound
10019 	      = wi::umin ((widest_int) (bound - 1),
10020 			  loop->nb_iterations_upper_bound);
10021       }
10022   }
10023   if (loop->any_likely_upper_bound)
10024     loop->nb_iterations_likely_upper_bound
10025       = (final_iter_may_be_partial
10026 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10027 			  + bias_for_lowest, lowest_vf) - 1
10028 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10029 			   + bias_for_lowest, lowest_vf) - 1);
10030   if (loop->any_estimate)
10031     loop->nb_iterations_estimate
10032       = (final_iter_may_be_partial
10033 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10034 			  assumed_vf) - 1
10035 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10036 			   assumed_vf) - 1);
10037 
10038   if (dump_enabled_p ())
10039     {
10040       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10041 	{
10042 	  dump_printf_loc (MSG_NOTE, vect_location,
10043 			   "LOOP VECTORIZED\n");
10044 	  if (loop->inner)
10045 	    dump_printf_loc (MSG_NOTE, vect_location,
10046 			     "OUTER LOOP VECTORIZED\n");
10047 	  dump_printf (MSG_NOTE, "\n");
10048 	}
10049       else
10050 	dump_printf_loc (MSG_NOTE, vect_location,
10051 			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10052 			 GET_MODE_NAME (loop_vinfo->vector_mode));
10053     }
10054 
10055   /* Loops vectorized with a variable factor won't benefit from
10056      unrolling/peeling.  */
10057   if (!vf.is_constant ())
10058     {
10059       loop->unroll = 1;
10060       if (dump_enabled_p ())
10061 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10062 			 " variable-length vectorization factor\n");
10063     }
10064   /* Free SLP instances here because otherwise stmt reference counting
10065      won't work.  */
10066   slp_instance instance;
10067   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10068     vect_free_slp_instance (instance);
10069   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10070   /* Clear-up safelen field since its value is invalid after vectorization
10071      since vectorized loop can have loop-carried dependencies.  */
10072   loop->safelen = 0;
10073 
10074   if (epilogue)
10075     {
10076       update_epilogue_loop_vinfo (epilogue, advance);
10077 
10078       epilogue->simduid = loop->simduid;
10079       epilogue->force_vectorize = loop->force_vectorize;
10080       epilogue->dont_vectorize = false;
10081     }
10082 
10083   return epilogue;
10084 }
10085 
10086 /* The code below is trying to perform simple optimization - revert
10087    if-conversion for masked stores, i.e. if the mask of a store is zero
10088    do not perform it and all stored value producers also if possible.
10089    For example,
10090      for (i=0; i<n; i++)
10091        if (c[i])
10092 	{
10093 	  p1[i] += 1;
10094 	  p2[i] = p3[i] +2;
10095 	}
10096    this transformation will produce the following semi-hammock:
10097 
10098    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10099      {
10100        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10101        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10102        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10103        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10104        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10105        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10106      }
10107 */
10108 
10109 void
10110 optimize_mask_stores (class loop *loop)
10111 {
10112   basic_block *bbs = get_loop_body (loop);
10113   unsigned nbbs = loop->num_nodes;
10114   unsigned i;
10115   basic_block bb;
10116   class loop *bb_loop;
10117   gimple_stmt_iterator gsi;
10118   gimple *stmt;
10119   auto_vec<gimple *> worklist;
10120   auto_purge_vect_location sentinel;
10121 
10122   vect_location = find_loop_location (loop);
10123   /* Pick up all masked stores in loop if any.  */
10124   for (i = 0; i < nbbs; i++)
10125     {
10126       bb = bbs[i];
10127       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10128 	   gsi_next (&gsi))
10129 	{
10130 	  stmt = gsi_stmt (gsi);
10131 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10132 	    worklist.safe_push (stmt);
10133 	}
10134     }
10135 
10136   free (bbs);
10137   if (worklist.is_empty ())
10138     return;
10139 
10140   /* Loop has masked stores.  */
10141   while (!worklist.is_empty ())
10142     {
10143       gimple *last, *last_store;
10144       edge e, efalse;
10145       tree mask;
10146       basic_block store_bb, join_bb;
10147       gimple_stmt_iterator gsi_to;
10148       tree vdef, new_vdef;
10149       gphi *phi;
10150       tree vectype;
10151       tree zero;
10152 
10153       last = worklist.pop ();
10154       mask = gimple_call_arg (last, 2);
10155       bb = gimple_bb (last);
10156       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10157 	 the same loop as if_bb.  It could be different to LOOP when two
10158 	 level loop-nest is vectorized and mask_store belongs to the inner
10159 	 one.  */
10160       e = split_block (bb, last);
10161       bb_loop = bb->loop_father;
10162       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10163       join_bb = e->dest;
10164       store_bb = create_empty_bb (bb);
10165       add_bb_to_loop (store_bb, bb_loop);
10166       e->flags = EDGE_TRUE_VALUE;
10167       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10168       /* Put STORE_BB to likely part.  */
10169       efalse->probability = profile_probability::unlikely ();
10170       store_bb->count = efalse->count ();
10171       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10172       if (dom_info_available_p (CDI_DOMINATORS))
10173 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10174       if (dump_enabled_p ())
10175 	dump_printf_loc (MSG_NOTE, vect_location,
10176 			 "Create new block %d to sink mask stores.",
10177 			 store_bb->index);
10178       /* Create vector comparison with boolean result.  */
10179       vectype = TREE_TYPE (mask);
10180       zero = build_zero_cst (vectype);
10181       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10182       gsi = gsi_last_bb (bb);
10183       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10184       /* Create new PHI node for vdef of the last masked store:
10185 	 .MEM_2 = VDEF <.MEM_1>
10186 	 will be converted to
10187 	 .MEM.3 = VDEF <.MEM_1>
10188 	 and new PHI node will be created in join bb
10189 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
10190       */
10191       vdef = gimple_vdef (last);
10192       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10193       gimple_set_vdef (last, new_vdef);
10194       phi = create_phi_node (vdef, join_bb);
10195       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10196 
10197       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10198       while (true)
10199 	{
10200 	  gimple_stmt_iterator gsi_from;
10201 	  gimple *stmt1 = NULL;
10202 
10203 	  /* Move masked store to STORE_BB.  */
10204 	  last_store = last;
10205 	  gsi = gsi_for_stmt (last);
10206 	  gsi_from = gsi;
10207 	  /* Shift GSI to the previous stmt for further traversal.  */
10208 	  gsi_prev (&gsi);
10209 	  gsi_to = gsi_start_bb (store_bb);
10210 	  gsi_move_before (&gsi_from, &gsi_to);
10211 	  /* Setup GSI_TO to the non-empty block start.  */
10212 	  gsi_to = gsi_start_bb (store_bb);
10213 	  if (dump_enabled_p ())
10214 	    dump_printf_loc (MSG_NOTE, vect_location,
10215 			     "Move stmt to created bb\n%G", last);
10216 	  /* Move all stored value producers if possible.  */
10217 	  while (!gsi_end_p (gsi))
10218 	    {
10219 	      tree lhs;
10220 	      imm_use_iterator imm_iter;
10221 	      use_operand_p use_p;
10222 	      bool res;
10223 
10224 	      /* Skip debug statements.  */
10225 	      if (is_gimple_debug (gsi_stmt (gsi)))
10226 		{
10227 		  gsi_prev (&gsi);
10228 		  continue;
10229 		}
10230 	      stmt1 = gsi_stmt (gsi);
10231 	      /* Do not consider statements writing to memory or having
10232 		 volatile operand.  */
10233 	      if (gimple_vdef (stmt1)
10234 		  || gimple_has_volatile_ops (stmt1))
10235 		break;
10236 	      gsi_from = gsi;
10237 	      gsi_prev (&gsi);
10238 	      lhs = gimple_get_lhs (stmt1);
10239 	      if (!lhs)
10240 		break;
10241 
10242 	      /* LHS of vectorized stmt must be SSA_NAME.  */
10243 	      if (TREE_CODE (lhs) != SSA_NAME)
10244 		break;
10245 
10246 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10247 		{
10248 		  /* Remove dead scalar statement.  */
10249 		  if (has_zero_uses (lhs))
10250 		    {
10251 		      gsi_remove (&gsi_from, true);
10252 		      continue;
10253 		    }
10254 		}
10255 
10256 	      /* Check that LHS does not have uses outside of STORE_BB.  */
10257 	      res = true;
10258 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10259 		{
10260 		  gimple *use_stmt;
10261 		  use_stmt = USE_STMT (use_p);
10262 		  if (is_gimple_debug (use_stmt))
10263 		    continue;
10264 		  if (gimple_bb (use_stmt) != store_bb)
10265 		    {
10266 		      res = false;
10267 		      break;
10268 		    }
10269 		}
10270 	      if (!res)
10271 		break;
10272 
10273 	      if (gimple_vuse (stmt1)
10274 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
10275 		break;
10276 
10277 	      /* Can move STMT1 to STORE_BB.  */
10278 	      if (dump_enabled_p ())
10279 		dump_printf_loc (MSG_NOTE, vect_location,
10280 				 "Move stmt to created bb\n%G", stmt1);
10281 	      gsi_move_before (&gsi_from, &gsi_to);
10282 	      /* Shift GSI_TO for further insertion.  */
10283 	      gsi_prev (&gsi_to);
10284 	    }
10285 	  /* Put other masked stores with the same mask to STORE_BB.  */
10286 	  if (worklist.is_empty ()
10287 	      || gimple_call_arg (worklist.last (), 2) != mask
10288 	      || worklist.last () != stmt1)
10289 	    break;
10290 	  last = worklist.pop ();
10291 	}
10292       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10293     }
10294 }
10295 
10296 /* Decide whether it is possible to use a zero-based induction variable
10297    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10298    the value that the induction variable must be able to hold in order
10299    to ensure that the rgroups eventually have no active vector elements.
10300    Return -1 otherwise.  */
10301 
10302 widest_int
10303 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10304 {
10305   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10306   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10307   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10308 
10309   /* Calculate the value that the induction variable must be able
10310      to hit in order to ensure that we end the loop with an all-false mask.
10311      This involves adding the maximum number of inactive trailing scalar
10312      iterations.  */
10313   widest_int iv_limit = -1;
10314   if (max_loop_iterations (loop, &iv_limit))
10315     {
10316       if (niters_skip)
10317 	{
10318 	  /* Add the maximum number of skipped iterations to the
10319 	     maximum iteration count.  */
10320 	  if (TREE_CODE (niters_skip) == INTEGER_CST)
10321 	    iv_limit += wi::to_widest (niters_skip);
10322 	  else
10323 	    iv_limit += max_vf - 1;
10324 	}
10325       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10326 	/* Make a conservatively-correct assumption.  */
10327 	iv_limit += max_vf - 1;
10328 
10329       /* IV_LIMIT is the maximum number of latch iterations, which is also
10330 	 the maximum in-range IV value.  Round this value down to the previous
10331 	 vector alignment boundary and then add an extra full iteration.  */
10332       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10333       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10334     }
10335   return iv_limit;
10336 }
10337 
10338 /* For the given rgroup_controls RGC, check whether an induction variable
10339    would ever hit a value that produces a set of all-false masks or zero
10340    lengths before wrapping around.  Return true if it's possible to wrap
10341    around before hitting the desirable value, otherwise return false.  */
10342 
10343 bool
10344 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10345 {
10346   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10347 
10348   if (iv_limit == -1)
10349     return true;
10350 
10351   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10352   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10353   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10354 
10355   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10356     return true;
10357 
10358   return false;
10359 }
10360