1 /* Loop Vectorization
2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 /* Loop Vectorization Pass.
59 
60    This pass tries to vectorize loops.
61 
62    For example, the vectorizer transforms the following simple loop:
63 
64         short a[N]; short b[N]; short c[N]; int i;
65 
66         for (i=0; i<N; i++){
67           a[i] = b[i] + c[i];
68         }
69 
70    as if it was manually vectorized by rewriting the source code into:
71 
72         typedef int __attribute__((mode(V8HI))) v8hi;
73         short a[N];  short b[N]; short c[N];   int i;
74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75         v8hi va, vb, vc;
76 
77         for (i=0; i<N/8; i++){
78           vb = pb[i];
79           vc = pc[i];
80           va = vb + vc;
81           pa[i] = va;
82         }
83 
84         The main entry to this pass is vectorize_loops(), in which
85    the vectorizer applies a set of analyses on a given set of loops,
86    followed by the actual vectorization transformation for the loops that
87    had successfully passed the analysis phase.
88         Throughout this pass we make a distinction between two types of
89    data: scalars (which are represented by SSA_NAMES), and memory references
90    ("data-refs").  These two types of data require different handling both
91    during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94    accesses are required to have a simple (consecutive) access pattern.
95 
96    Analysis phase:
97    ===============
98         The driver for the analysis phase is vect_analyze_loop().
99    It applies a set of analyses, some of which rely on the scalar evolution
100    analyzer (scev) developed by Sebastian Pop.
101 
102         During the analysis phase the vectorizer records some information
103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104    loop, as well as general information about the loop as a whole, which is
105    recorded in a "loop_vec_info" struct attached to each loop.
106 
107    Transformation phase:
108    =====================
109         The loop transformation phase scans all the stmts in the loop, and
110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111    the loop that needs to be vectorized.  It inserts the vector code sequence
112    just before the scalar stmt S, and records a pointer to the vector code
113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114    attached to S).  This pointer will be used for the vectorization of following
115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116    otherwise, we rely on dead code elimination for removing it.
117 
118         For example, say stmt S1 was vectorized into stmt VS1:
119 
120    VS1: vb = px[i];
121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122    S2:  a = b;
123 
124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
127    resulting sequence would be:
128 
129    VS1: vb = px[i];
130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131    VS2: va = vb;
132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 
134         Operands that are not SSA_NAMEs are data-refs that appear in
135    load/store operations (like 'x[i]' in S1), and are handled differently.
136 
137    Target modeling:
138    =================
139         Currently the only target specific information that is used is the
140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different sizes of vectors will, for now, need
142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
143    flexibility will be added in the future.
144 
145         Since we only vectorize operations whose vector form can be
146    expressed using existing tree codes, to verify that an operation is
147    supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149    the value found is CODE_FOR_nothing, then there's no target support, and
150    we can't vectorize the stmt.
151 
152    For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155 
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160    may already be set for general statements (not just data refs).  */
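
/* A boolean-valued statement (for example a comparison feeding a mask)
   cannot be given a vector type at this point: its mask vector type can
   only be calculated once the vectorization factor is known.  Such
   statements are therefore queued in MASK_PRODUCERS and resolved later by
   vect_get_mask_type_for_stmt (see vect_determine_vectorization_factor).  */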
161 
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 			      bool vectype_maybe_set_p,
165 			      poly_uint64 *vf,
166 			      vec<stmt_vec_info > *mask_producers)
167 {
168   gimple *stmt = stmt_info->stmt;
169 
170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171        && !STMT_VINFO_LIVE_P (stmt_info))
172       || gimple_clobber_p (stmt))
173     {
174       if (dump_enabled_p ())
175 	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176       return opt_result::success ();
177     }
178 
179   tree stmt_vectype, nunits_vectype;
180   opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 						   &nunits_vectype);
182   if (!res)
183     return res;
184 
185   if (stmt_vectype)
186     {
187       if (STMT_VINFO_VECTYPE (stmt_info))
188 	/* The only case when a vectype had been already set is for stmts
189 	   that contain a data ref, or for "pattern-stmts" (stmts generated
190 	   by the vectorizer to represent/replace a certain idiom).  */
191 	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 		     || vectype_maybe_set_p)
193 		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194       else if (stmt_vectype == boolean_type_node)
195 	mask_producers->safe_push (stmt_info);
196       else
197 	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198     }
199 
200   if (nunits_vectype)
201     vect_update_max_nunits (vf, nunits_vectype);
202 
203   return opt_result::success ();
204 }
205 
206 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
207    types of STMT_INFO and all attached pattern statements and update
208    the vectorization factor VF accordingly.  If some of the statements
209    produce a mask result whose vector type can only be calculated later,
210    add them to MASK_PRODUCERS.  Return true on success or false if
211    something prevented vectorization.  */
212 
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 			    vec<stmt_vec_info > *mask_producers)
216 {
217   vec_info *vinfo = stmt_info->vinfo;
218   if (dump_enabled_p ())
219     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 		     stmt_info->stmt);
221   opt_result res
222     = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223   if (!res)
224     return res;
225 
226   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227       && STMT_VINFO_RELATED_STMT (stmt_info))
228     {
229       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 
232       /* If a pattern statement has def stmts, analyze them too.  */
233       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 	   !gsi_end_p (si); gsi_next (&si))
235 	{
236 	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 	  if (dump_enabled_p ())
238 	    dump_printf_loc (MSG_NOTE, vect_location,
239 			     "==> examining pattern def stmt: %G",
240 			     def_stmt_info->stmt);
241 	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 					      vf, mask_producers);
245 	  if (!res)
246 	    return res;
247 	}
248 
249       if (dump_enabled_p ())
250 	dump_printf_loc (MSG_NOTE, vect_location,
251 			 "==> examining pattern statement: %G",
252 			 stmt_info->stmt);
253       res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254       if (!res)
255 	return res;
256     }
257 
258   return opt_result::success ();
259 }
260 
261 /* Function vect_determine_vectorization_factor
262 
263    Determine the vectorization factor (VF).  VF is the number of data elements
264    that are operated upon in parallel in a single iteration of the vectorized
265    loop.  For example, when vectorizing a loop that operates on 4byte elements,
266    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267    elements can fit in a single vector register.
268 
269    We currently support vectorization of loops in which all types operated upon
270    are of the same size.  Therefore this function currently sets VF according to
271    the size of the types operated upon, and fails if there are multiple sizes
272    in the loop.
273 
274    VF is also the factor by which the loop iterations are strip-mined, e.g.:
275    original loop:
276         for (i=0; i<N; i++){
277           a[i] = b[i] + c[i];
278         }
279 
280    vectorized loop:
281         for (i=0; i<N; i+=VF){
282           a[i:VF] = b[i:VF] + c[i:VF];
283         }
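
   For instance, assuming a hypothetical target with a 16-byte vector size
   operating on 2-byte "short" elements, VF would be 16 / 2 = 8 and the
   strip-mined loop would process 8 elements per iteration:

        for (i=0; i<N; i+=8){
          a[i:8] = b[i:8] + c[i:8];
        }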
284 */
285 
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291   unsigned nbbs = loop->num_nodes;
292   poly_uint64 vectorization_factor = 1;
293   tree scalar_type = NULL_TREE;
294   gphi *phi;
295   tree vectype;
296   stmt_vec_info stmt_info;
297   unsigned i;
298   auto_vec<stmt_vec_info> mask_producers;
299 
300   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301 
302   for (i = 0; i < nbbs; i++)
303     {
304       basic_block bb = bbs[i];
305 
306       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 	   gsi_next (&si))
308 	{
309 	  phi = si.phi ();
310 	  stmt_info = loop_vinfo->lookup_stmt (phi);
311 	  if (dump_enabled_p ())
312 	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 			     phi);
314 
315 	  gcc_assert (stmt_info);
316 
317 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
318 	      || STMT_VINFO_LIVE_P (stmt_info))
319             {
320 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321               scalar_type = TREE_TYPE (PHI_RESULT (phi));
322 
323 	      if (dump_enabled_p ())
324 		dump_printf_loc (MSG_NOTE, vect_location,
325 				 "get vectype for scalar type:  %T\n",
326 				 scalar_type);
327 
328 	      vectype = get_vectype_for_scalar_type (scalar_type);
329 	      if (!vectype)
330 		return opt_result::failure_at (phi,
331 					       "not vectorized: unsupported "
332 					       "data-type %T\n",
333 					       scalar_type);
334 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
335 
336 	      if (dump_enabled_p ())
337 		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 				 vectype);
339 
340 	      if (dump_enabled_p ())
341 		{
342 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 		  dump_printf (MSG_NOTE, "\n");
345 		}
346 
347 	      vect_update_max_nunits (&vectorization_factor, vectype);
348 	    }
349 	}
350 
351       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 	   gsi_next (&si))
353 	{
354 	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 	  opt_result res
356 	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 					  &mask_producers);
358 	  if (!res)
359 	    return res;
360         }
361     }
362 
363   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
364   if (dump_enabled_p ())
365     {
366       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367       dump_dec (MSG_NOTE, vectorization_factor);
368       dump_printf (MSG_NOTE, "\n");
369     }
370 
371   if (known_le (vectorization_factor, 1U))
372     return opt_result::failure_at (vect_location,
373 				   "not vectorized: unsupported data-type\n");
374   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375 
376   for (i = 0; i < mask_producers.length (); i++)
377     {
378       stmt_info = mask_producers[i];
379       opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380       if (!mask_type)
381 	return opt_result::propagate_failure (mask_type);
382       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383     }
384 
385   return opt_result::success ();
386 }
387 
388 
389 /* Function vect_is_simple_iv_evolution.
390 
391    FORNOW: A simple evolution of an induction variable in the loop is
392    considered a polynomial evolution.  */
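
/* As a purely illustrative example: for an induction such as

        for (i = 0; i < n; i++)
          p = p + 4;

   analyze_scalar_evolution gives the access function {p_0, +, 4}_loop, so
   the evolution part is the constant 4 and the analysis below succeeds with
   *INIT = p_0 and *STEP = 4.  An evolution whose step is itself a chrec,
   e.g. {s_0, +, {0, +, 1}}_loop for "s += i", has degree >= 2 and is
   rejected as not "simple".  */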
393 
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396                              tree * step)
397 {
398   tree init_expr;
399   tree step_expr;
400   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401   basic_block bb;
402 
403   /* When there is no evolution in this loop, the evolution function
404      is not "simple".  */
405   if (evolution_part == NULL_TREE)
406     return false;
407 
408   /* When the evolution is a polynomial of degree >= 2
409      the evolution function is not "simple".  */
410   if (tree_is_chrec (evolution_part))
411     return false;
412 
413   step_expr = evolution_part;
414   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415 
416   if (dump_enabled_p ())
417     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
418 		     step_expr, init_expr);
419 
420   *init = init_expr;
421   *step = step_expr;
422 
423   if (TREE_CODE (step_expr) != INTEGER_CST
424       && (TREE_CODE (step_expr) != SSA_NAME
425 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 		  || !flag_associative_math)))
430       && (TREE_CODE (step_expr) != REAL_CST
431 	  || !flag_associative_math))
432     {
433       if (dump_enabled_p ())
434         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435                          "step unknown.\n");
436       return false;
437     }
438 
439   return true;
440 }
441 
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443    what we are assuming is a double reduction.  For example, given
444    a structure like this:
445 
446       outer1:
447 	x_1 = PHI <x_4(outer2), ...>;
448 	...
449 
450       inner:
451 	x_2 = PHI <x_1(outer1), ...>;
452 	...
453 	x_3 = ...;
454 	...
455 
456       outer2:
457 	x_4 = PHI <x_3(inner)>;
458 	...
459 
460    outer loop analysis would treat x_1 as a double reduction phi and
461    this function would then return true for x_2.  */
462 
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467   use_operand_p use_p;
468   ssa_op_iter op_iter;
469   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 	return true;
473   return false;
474 }
475 
476 /* Function vect_analyze_scalar_cycles_1.
477 
478    Examine the cross iteration def-use cycles of scalar variables
479    in LOOP.  LOOP_VINFO represents the loop that is now being
480    considered for vectorization (can be LOOP, or an outer-loop
481    enclosing LOOP).  */
482 
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486   basic_block bb = loop->header;
487   tree init, step;
488   auto_vec<stmt_vec_info, 64> worklist;
489   gphi_iterator gsi;
490   bool double_reduc;
491 
492   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493 
494   /* First - identify all inductions.  Reduction detection assumes that all the
495      inductions have been identified; therefore, this order must not be
496      changed.  */
497   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498     {
499       gphi *phi = gsi.phi ();
500       tree access_fn = NULL;
501       tree def = PHI_RESULT (phi);
502       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503 
504       if (dump_enabled_p ())
505 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506 
507       /* Skip virtual phi's.  The data dependences that are associated with
508          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
509       if (virtual_operand_p (def))
510 	continue;
511 
512       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513 
514       /* Analyze the evolution function.  */
515       access_fn = analyze_scalar_evolution (loop, def);
516       if (access_fn)
517 	{
518 	  STRIP_NOPS (access_fn);
519 	  if (dump_enabled_p ())
520 	    dump_printf_loc (MSG_NOTE, vect_location,
521 			     "Access function of PHI: %T\n", access_fn);
522 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 	    = initial_condition_in_loop_num (access_fn, loop->num);
524 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 	    = evolution_part_in_loop_num (access_fn, loop->num);
526 	}
527 
528       if (!access_fn
529 	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 	      && TREE_CODE (step) != INTEGER_CST))
533 	{
534 	  worklist.safe_push (stmt_vinfo);
535 	  continue;
536 	}
537 
538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 		  != NULL_TREE);
540       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541 
542       if (dump_enabled_p ())
543 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545     }
546 
547 
548   /* Second - identify all reductions and nested cycles.  */
549   while (worklist.length () > 0)
550     {
551       stmt_vec_info stmt_vinfo = worklist.pop ();
552       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553       tree def = PHI_RESULT (phi);
554 
555       if (dump_enabled_p ())
556 	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557 
558       gcc_assert (!virtual_operand_p (def)
559 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560 
561       stmt_vec_info reduc_stmt_info
562 	= vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 				       &double_reduc, false);
564       if (reduc_stmt_info)
565         {
566           if (double_reduc)
567             {
568               if (dump_enabled_p ())
569                 dump_printf_loc (MSG_NOTE, vect_location,
570 				 "Detected double reduction.\n");
571 
572               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 		= vect_double_reduction_def;
575             }
576           else
577             {
578               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579                 {
580                   if (dump_enabled_p ())
581                     dump_printf_loc (MSG_NOTE, vect_location,
582 				     "Detected vectorizable nested cycle.\n");
583 
584                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586                 }
587               else
588                 {
589                   if (dump_enabled_p ())
590                     dump_printf_loc (MSG_NOTE, vect_location,
591 				     "Detected reduction.\n");
592 
593                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595                   /* Store the reduction cycles for possible vectorization in
596                      loop-aware SLP if it was not detected as reduction
597 		     chain.  */
598 		  if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 		      (reduc_stmt_info);
601                 }
602             }
603         }
604       else
605         if (dump_enabled_p ())
606           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 			   "Unknown def-use cycle pattern.\n");
608     }
609 }
610 
611 
612 /* Function vect_analyze_scalar_cycles.
613 
614    Examine the cross iteration def-use cycles of scalar variables, by
615    analyzing the loop-header PHIs of scalar variables.  Classify each
616    cycle as one of the following: invariant, induction, reduction, unknown.
617    We do that for the loop represented by LOOP_VINFO, and also for its
618    inner-loop, if it exists.
619    Examples for scalar cycles:
620 
621    Example1: reduction:
622 
623               loop1:
624               for (i=0; i<N; i++)
625                  sum += a[i];
626 
627    Example2: induction:
628 
629               loop2:
630               for (i=0; i<N; i++)
631                  a[i] = i;  */
632 
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637 
638   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639 
640   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641      Reductions in such an inner-loop therefore have different properties than
642      the reductions in the nest that gets vectorized:
643      1. When vectorized, they are executed in the same order as in the original
644         scalar loop, so we can't change the order of computation when
645         vectorizing them.
646      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647         current checks are too strict.  */
648 
649   if (loop->inner)
650     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652 
653 /* Transfer group and reduction information from STMT_INFO to its
654    pattern stmt.  */
655 
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660   stmt_vec_info stmtp;
661   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664   do
665     {
666       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669       if (stmt_info)
670 	REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 	  = STMT_VINFO_RELATED_STMT (stmt_info);
672     }
673   while (stmt_info);
674   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
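
/* For illustration (a hypothetical chain): if the scalar reduction chain is
   S1 -> S2 -> S3 and each Si was replaced by a pattern statement Pi
   (STMT_VINFO_RELATED_STMT (Si) == Pi), then after the fixup the group
   information lives on the pattern statements instead:

	REDUC_GROUP_FIRST_ELEMENT (Pi) == P1  for each i
	REDUC_GROUP_NEXT_ELEMENT (P1) == P2
	REDUC_GROUP_NEXT_ELEMENT (P2) == P3

   and the last pattern statement P3 is marked vect_reduction_def.  */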
676 
677 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
678 
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682   stmt_vec_info first;
683   unsigned i;
684 
685   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686     if (STMT_VINFO_IN_PATTERN_P (first))
687       {
688 	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 	while (next)
690 	  {
691 	    if (! STMT_VINFO_IN_PATTERN_P (next))
692 	      break;
693 	    next = REDUC_GROUP_NEXT_ELEMENT (next);
694 	  }
695 	/* If not all stmts in the chain are patterns, try to handle
696 	   the chain without patterns.  */
697 	if (! next)
698 	  {
699 	    vect_fixup_reduc_chain (first);
700 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 	      = STMT_VINFO_RELATED_STMT (first);
702 	  }
703       }
704 }
705 
706 /* Function vect_get_loop_niters.
707 
708    Determine how many times the loop is executed and place the count
709    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
710    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
711    niter information holds in ASSUMPTIONS.
712 
713    Return the loop exit condition.  */
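
/* For example (illustrative numbers only): for a countable loop whose body
   runs 100 times in the do-while form required by the vectorizer, the latch
   executes 99 times, so NUMBER_OF_ITERATIONSM1 is 99 while
   NUMBER_OF_ITERATIONS, the number of loop-header executions, is 100.  */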
714 
715 
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 		      tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720   edge exit = single_exit (loop);
721   struct tree_niter_desc niter_desc;
722   tree niter_assumptions, niter, may_be_zero;
723   gcond *cond = get_loop_exit_condition (loop);
724 
725   *assumptions = boolean_true_node;
726   *number_of_iterationsm1 = chrec_dont_know;
727   *number_of_iterations = chrec_dont_know;
728   DUMP_VECT_SCOPE ("get_loop_niters");
729 
730   if (!exit)
731     return cond;
732 
733   niter = chrec_dont_know;
734   may_be_zero = NULL_TREE;
735   niter_assumptions = boolean_true_node;
736   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737       || chrec_contains_undetermined (niter_desc.niter))
738     return cond;
739 
740   niter_assumptions = niter_desc.assumptions;
741   may_be_zero = niter_desc.may_be_zero;
742   niter = niter_desc.niter;
743 
744   if (may_be_zero && integer_zerop (may_be_zero))
745     may_be_zero = NULL_TREE;
746 
747   if (may_be_zero)
748     {
749       if (COMPARISON_CLASS_P (may_be_zero))
750 	{
751 	  /* Try to combine may_be_zero with assumptions, this can simplify
752 	     computation of niter expression.  */
753 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 					     niter_assumptions,
756 					     fold_build1 (TRUTH_NOT_EXPR,
757 							  boolean_type_node,
758 							  may_be_zero));
759 	  else
760 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 				 build_int_cst (TREE_TYPE (niter), 0),
762 				 rewrite_to_non_trapping_overflow (niter));
763 
764 	  may_be_zero = NULL_TREE;
765 	}
766       else if (integer_nonzerop (may_be_zero))
767 	{
768 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 	  return cond;
771 	}
772       else
773 	return cond;
774     }
775 
776   *assumptions = niter_assumptions;
777   *number_of_iterationsm1 = niter;
778 
779   /* We want the number of loop header executions which is the number
780      of latch executions plus one.
781      ???  For UINT_MAX latch executions this number overflows to zero
782      for loops like do { n++; } while (n != 0);  */
783   if (niter && !chrec_contains_undetermined (niter))
784     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 			  build_int_cst (TREE_TYPE (niter), 1));
786   *number_of_iterations = niter;
787 
788   return cond;
789 }
790 
791 /* Function bb_in_loop_p
792 
793    Used as predicate for dfs order traversal of the loop bbs.  */
794 
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798   const struct loop *const loop = (const struct loop *)data;
799   if (flow_bb_inside_loop_p (loop, bb))
800     return true;
801   return false;
802 }
803 
804 
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806    stmt_vec_info structs for all the stmts in LOOP_IN.  */
807 
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809   : vec_info (vec_info::loop, init_cost (loop_in), shared),
810     loop (loop_in),
811     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812     num_itersm1 (NULL_TREE),
813     num_iters (NULL_TREE),
814     num_iters_unchanged (NULL_TREE),
815     num_iters_assumptions (NULL_TREE),
816     th (0),
817     versioning_threshold (0),
818     vectorization_factor (0),
819     max_vectorization_factor (0),
820     mask_skip_niters (NULL_TREE),
821     mask_compare_type (NULL_TREE),
822     simd_if_cond (NULL_TREE),
823     unaligned_dr (NULL),
824     peeling_for_alignment (0),
825     ptr_mask (0),
826     ivexpr_map (NULL),
827     slp_unrolling_factor (1),
828     single_scalar_iteration_cost (0),
829     vectorizable (false),
830     can_fully_mask_p (true),
831     fully_masked_p (false),
832     peeling_for_gaps (false),
833     peeling_for_niter (false),
834     operands_swapped (false),
835     no_data_dependencies (false),
836     has_mask_store (false),
837     scalar_loop (NULL),
838     orig_loop_info (NULL)
839 {
840   /* CHECKME: We want to visit all BBs before their successors (except for
841      latch blocks, for which this assertion wouldn't hold).  In the simple
842    case of the loop forms we allow, a dfs order of the BBs would be the same
843      as reversed postorder traversal, so we are safe.  */
844 
845   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 					  bbs, loop->num_nodes, loop);
847   gcc_assert (nbbs == loop->num_nodes);
848 
849   for (unsigned int i = 0; i < nbbs; i++)
850     {
851       basic_block bb = bbs[i];
852       gimple_stmt_iterator si;
853 
854       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 	{
856 	  gimple *phi = gsi_stmt (si);
857 	  gimple_set_uid (phi, 0);
858 	  add_stmt (phi);
859 	}
860 
861       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 	{
863 	  gimple *stmt = gsi_stmt (si);
864 	  gimple_set_uid (stmt, 0);
865 	  add_stmt (stmt);
866 	  /* If the .GOMP_SIMD_LANE call for the current loop has 2 arguments,
867 	     the second argument is the #pragma omp simd if (x) condition: when
868 	     it is 0, the loop shouldn't be vectorized; when it is a non-zero
869 	     constant, it should be vectorized normally; otherwise the loop is
870 	     versioned, running the vectorized copy when the condition is non-zero.  */
871 	  if (loop_in->simduid
872 	      && is_gimple_call (stmt)
873 	      && gimple_call_internal_p (stmt)
874 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 	      && gimple_call_num_args (stmt) >= 2
876 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 	      && (loop_in->simduid
878 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 	    {
880 	      tree arg = gimple_call_arg (stmt, 1);
881 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 		simd_if_cond = arg;
883 	      else
884 		gcc_assert (integer_nonzerop (arg));
885 	    }
886 	}
887     }
888 }
889 
890 /* Free all levels of MASKS.  */
891 
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
894 {
895   rgroup_masks *rgm;
896   unsigned int i;
897   FOR_EACH_VEC_ELT (*masks, i, rgm)
898     rgm->masks.release ();
899   masks->release ();
900 }
901 
902 /* Free all memory used by the _loop_vec_info, as well as all the
903    stmt_vec_info structs of all the stmts in the loop.  */
904 
905 _loop_vec_info::~_loop_vec_info ()
906 {
907   int nbbs;
908   gimple_stmt_iterator si;
909   int j;
910 
911   nbbs = loop->num_nodes;
912   for (j = 0; j < nbbs; j++)
913     {
914       basic_block bb = bbs[j];
915       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
916         {
917 	  gimple *stmt = gsi_stmt (si);
918 
919 	  /* We may have broken canonical form by moving a constant
920 	     into RHS1 of a commutative op.  Fix such occurrences.  */
921 	  if (operands_swapped && is_gimple_assign (stmt))
922 	    {
923 	      enum tree_code code = gimple_assign_rhs_code (stmt);
924 
925 	      if ((code == PLUS_EXPR
926 		   || code == POINTER_PLUS_EXPR
927 		   || code == MULT_EXPR)
928 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
929 		swap_ssa_operands (stmt,
930 				   gimple_assign_rhs1_ptr (stmt),
931 				   gimple_assign_rhs2_ptr (stmt));
932 	      else if (code == COND_EXPR
933 		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
934 		{
935 		  tree cond_expr = gimple_assign_rhs1 (stmt);
936 		  enum tree_code cond_code = TREE_CODE (cond_expr);
937 
938 		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
939 		    {
940 		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 								  0));
942 		      cond_code = invert_tree_comparison (cond_code,
943 							  honor_nans);
944 		      if (cond_code != ERROR_MARK)
945 			{
946 			  TREE_SET_CODE (cond_expr, cond_code);
947 			  swap_ssa_operands (stmt,
948 					     gimple_assign_rhs2_ptr (stmt),
949 					     gimple_assign_rhs3_ptr (stmt));
950 			}
951 		    }
952 		}
953 	    }
954           gsi_next (&si);
955         }
956     }
957 
958   free (bbs);
959 
960   release_vec_loop_masks (&masks);
961   delete ivexpr_map;
962 
963   loop->aux = NULL;
964 }
965 
966 /* Return an invariant or register for EXPR and emit necessary
967    computations in the LOOP_VINFO loop preheader.  */
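
/* For example (with hypothetical SSA names): given the non-invariant
   expression "n_5 * 4", the first call gimplifies it into a new preheader
   statement, say "_9 = n_5 * 4", and returns _9; later calls with an equal
   expression return the cached _9 instead of emitting it again.  */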
968 
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
971 {
972   if (is_gimple_reg (expr)
973       || is_gimple_min_invariant (expr))
974     return expr;
975 
976   if (! loop_vinfo->ivexpr_map)
977     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979   if (! cached)
980     {
981       gimple_seq stmts = NULL;
982       cached = force_gimple_operand (unshare_expr (expr),
983 				     &stmts, true, NULL_TREE);
984       if (stmts)
985 	{
986 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 	  gsi_insert_seq_on_edge_immediate (e, stmts);
988 	}
989     }
990   return cached;
991 }
992 
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994    all masks required to mask LOOP_VINFO.  */
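
/* (IFN_WHILE_ULT computes a mask whose element I is true iff OP1 + I < OP2;
   this is the operation used to generate the loop masks when the loop is
   fully masked.)  */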
995 
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
998 {
999   rgroup_masks *rgm;
1000   unsigned int i;
1001   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002     if (rgm->mask_type != NULL_TREE
1003 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 					    cmp_type, rgm->mask_type,
1005 					    OPTIMIZE_FOR_SPEED))
1006       return false;
1007   return true;
1008 }
1009 
1010 /* Calculate the maximum number of scalars per iteration for every
1011    rgroup in LOOP_VINFO.  */
1012 
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1015 {
1016   unsigned int res = 1;
1017   unsigned int i;
1018   rgroup_masks *rgm;
1019   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020     res = MAX (res, rgm->max_nscalars_per_iter);
1021   return res;
1022 }
1023 
1024 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1025    whether we can actually generate the masks required.  Return true if so,
1026    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
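
/* A worked example with made-up numbers: if the loop is known to execute at
   most 1000 iterations (999 back edges) and the widest rgroup has 2 scalars
   per iteration, then max_ni = 1000 * 2 = 2000, which needs 11 bits
   (2^11 = 2048).  Any integer mode of at least 11 bits whose type can feed
   IFN_WHILE_ULT for every rgroup mask type would do; the search below keeps
   going up to Pmode so the WHILE operands can be reused in address
   calculations.  */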
1027 
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1030 {
1031   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032   unsigned int min_ni_width;
1033 
1034   /* Use a normal loop if there are no statements that need masking.
1035      This only happens in rare degenerate cases: it means that the loop
1036      has no loads, no stores, and no live-out values.  */
1037   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1038     return false;
1039 
1040   /* Get the maximum number of iterations that is representable
1041      in the counter type.  */
1042   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1043   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1044 
1045   /* Get a more refined estimate for the number of iterations.  */
1046   widest_int max_back_edges;
1047   if (max_loop_iterations (loop, &max_back_edges))
1048     max_ni = wi::smin (max_ni, max_back_edges + 1);
1049 
1050   /* Account for rgroup masks, in which each bit is replicated N times.  */
1051   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1052 
1053   /* Work out how many bits we need to represent the limit.  */
1054   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1055 
1056   /* Find a scalar mode for which WHILE_ULT is supported.  */
1057   opt_scalar_int_mode cmp_mode_iter;
1058   tree cmp_type = NULL_TREE;
1059   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1060     {
1061       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1062       if (cmp_bits >= min_ni_width
1063 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1064 	{
1065 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1066 	  if (this_type
1067 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1068 	    {
1069 	      /* Although we could stop as soon as we find a valid mode,
1070 		 it's often better to continue until we hit Pmode, since the
1071 		 operands to the WHILE are more likely to be reusable in
1072 		 address calculations.  */
1073 	      cmp_type = this_type;
1074 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1075 		break;
1076 	    }
1077 	}
1078     }
1079 
1080   if (!cmp_type)
1081     return false;
1082 
1083   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1084   return true;
1085 }
1086 
1087 /* Calculate the cost of one scalar iteration of the loop.  */
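/* For instance (a made-up loop body): "a[i] = b[i] + c[i]" contributes two
   scalar_load costs, one scalar_stmt cost for the addition and one
   scalar_store cost per iteration; statements in an inner loop are weighted
   by the guessed factor of 50 used below.  */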
1088 static void
1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1090 {
1091   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1092   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1093   int nbbs = loop->num_nodes, factor;
1094   int innerloop_iters, i;
1095 
1096   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1097 
1098   /* Gather costs for statements in the scalar loop.  */
1099 
1100   /* FORNOW.  */
1101   innerloop_iters = 1;
1102   if (loop->inner)
1103     innerloop_iters = 50; /* FIXME */
1104 
1105   for (i = 0; i < nbbs; i++)
1106     {
1107       gimple_stmt_iterator si;
1108       basic_block bb = bbs[i];
1109 
1110       if (bb->loop_father == loop->inner)
1111         factor = innerloop_iters;
1112       else
1113         factor = 1;
1114 
1115       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1116         {
1117 	  gimple *stmt = gsi_stmt (si);
1118 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1119 
1120           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1121             continue;
1122 
1123           /* Skip stmts that are not vectorized inside the loop.  */
1124 	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1125           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1126               && (!STMT_VINFO_LIVE_P (vstmt_info)
1127                   || !VECTORIZABLE_CYCLE_DEF
1128 			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1129             continue;
1130 
1131 	  vect_cost_for_stmt kind;
1132           if (STMT_VINFO_DATA_REF (stmt_info))
1133             {
1134               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1135                kind = scalar_load;
1136              else
1137                kind = scalar_store;
1138             }
1139           else
1140             kind = scalar_stmt;
1141 
1142 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1143 			    factor, kind, stmt_info, 0, vect_prologue);
1144         }
1145     }
1146 
1147   /* Now accumulate cost.  */
1148   void *target_cost_data = init_cost (loop);
1149   stmt_info_for_cost *si;
1150   int j;
1151   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1152 		    j, si)
1153     (void) add_stmt_cost (target_cost_data, si->count,
1154 			  si->kind, si->stmt_info, si->misalign,
1155 			  vect_body);
1156   unsigned dummy, body_cost = 0;
1157   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1158   destroy_cost_data (target_cost_data);
1159   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1160 }
1161 
1162 
1163 /* Function vect_analyze_loop_form_1.
1164 
1165    Verify that certain CFG restrictions hold, including:
1166    - the loop has a pre-header
1167    - the loop has a single entry and exit
1168    - the loop exit condition is simple enough
1169    - the number of iterations can be analyzed, i.e., a countable loop.  The
1170      niter could be analyzed under some assumptions.  */
1171 
1172 opt_result
1173 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1174 			  tree *assumptions, tree *number_of_iterationsm1,
1175 			  tree *number_of_iterations, gcond **inner_loop_cond)
1176 {
1177   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1178 
1179   /* Different restrictions apply when we are considering an inner-most loop,
1180      vs. an outer (nested) loop.
1181      (FORNOW. May want to relax some of these restrictions in the future).  */
1182 
1183   if (!loop->inner)
1184     {
1185       /* Inner-most loop.  We currently require that the number of BBs is
1186 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1187 	 look like this:
1188 
1189                         (pre-header)
1190                            |
1191                           header <--------+
1192                            | |            |
1193                            | +--> latch --+
1194                            |
1195                         (exit-bb)  */
1196 
1197       if (loop->num_nodes != 2)
1198 	return opt_result::failure_at (vect_location,
1199 				       "not vectorized:"
1200 				       " control flow in loop.\n");
1201 
1202       if (empty_block_p (loop->header))
1203 	return opt_result::failure_at (vect_location,
1204 				       "not vectorized: empty loop.\n");
1205     }
1206   else
1207     {
1208       struct loop *innerloop = loop->inner;
1209       edge entryedge;
1210 
1211       /* Nested loop. We currently require that the loop is doubly-nested,
1212 	 contains a single inner loop, and the number of BBs is exactly 5.
1213 	 Vectorizable outer-loops look like this:
1214 
1215 			(pre-header)
1216 			   |
1217 			  header <---+
1218 			   |         |
1219 		          inner-loop |
1220 			   |         |
1221 			  tail ------+
1222 			   |
1223 		        (exit-bb)
1224 
1225 	 The inner-loop has the properties expected of inner-most loops
1226 	 as described above.  */
1227 
1228       if ((loop->inner)->inner || (loop->inner)->next)
1229 	return opt_result::failure_at (vect_location,
1230 				       "not vectorized:"
1231 				       " multiple nested loops.\n");
1232 
1233       if (loop->num_nodes != 5)
1234 	return opt_result::failure_at (vect_location,
1235 				       "not vectorized:"
1236 				       " control flow in loop.\n");
1237 
1238       entryedge = loop_preheader_edge (innerloop);
1239       if (entryedge->src != loop->header
1240 	  || !single_exit (innerloop)
1241 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 	return opt_result::failure_at (vect_location,
1243 				       "not vectorized:"
1244 				       " unsupported outerloop form.\n");
1245 
1246       /* Analyze the inner-loop.  */
1247       tree inner_niterm1, inner_niter, inner_assumptions;
1248       opt_result res
1249 	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1250 				    &inner_assumptions, &inner_niterm1,
1251 				    &inner_niter, NULL);
1252       if (!res)
1253 	{
1254 	  if (dump_enabled_p ())
1255 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 			     "not vectorized: Bad inner loop.\n");
1257 	  return res;
1258 	}
1259 
1260       /* Don't support analyzing niter under assumptions for inner
1261 	 loop.  */
1262       if (!integer_onep (inner_assumptions))
1263 	return opt_result::failure_at (vect_location,
1264 				       "not vectorized: Bad inner loop.\n");
1265 
1266       if (!expr_invariant_in_loop_p (loop, inner_niter))
1267 	return opt_result::failure_at (vect_location,
1268 				       "not vectorized: inner-loop count not"
1269 				       " invariant.\n");
1270 
1271       if (dump_enabled_p ())
1272         dump_printf_loc (MSG_NOTE, vect_location,
1273 			 "Considering outer-loop vectorization.\n");
1274     }
1275 
1276   if (!single_exit (loop))
1277     return opt_result::failure_at (vect_location,
1278 				   "not vectorized: multiple exits.\n");
1279   if (EDGE_COUNT (loop->header->preds) != 2)
1280     return opt_result::failure_at (vect_location,
1281 				   "not vectorized:"
1282 				   " too many incoming edges.\n");
1283 
1284   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285      that the loop is represented as a do-while (with a proper if-guard
1286      before the loop if needed), where the loop header contains all the
1287      executable statements, and the latch is empty.  */
1288   if (!empty_block_p (loop->latch)
1289       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290     return opt_result::failure_at (vect_location,
1291 				   "not vectorized: latch block not empty.\n");
1292 
1293   /* Make sure the exit is not abnormal.  */
1294   edge e = single_exit (loop);
1295   if (e->flags & EDGE_ABNORMAL)
1296     return opt_result::failure_at (vect_location,
1297 				   "not vectorized:"
1298 				   " abnormal loop exit edge.\n");
1299 
1300   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1301 				     number_of_iterationsm1);
1302   if (!*loop_cond)
1303     return opt_result::failure_at
1304       (vect_location,
1305        "not vectorized: complicated exit condition.\n");
1306 
1307   if (integer_zerop (*assumptions)
1308       || !*number_of_iterations
1309       || chrec_contains_undetermined (*number_of_iterations))
1310     return opt_result::failure_at
1311       (*loop_cond,
1312        "not vectorized: number of iterations cannot be computed.\n");
1313 
1314   if (integer_zerop (*number_of_iterations))
1315     return opt_result::failure_at
1316       (*loop_cond,
1317        "not vectorized: number of iterations = 0.\n");
1318 
1319   return opt_result::success ();
1320 }
1321 
1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1323 
1324 opt_loop_vec_info
1325 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1326 {
1327   tree assumptions, number_of_iterations, number_of_iterationsm1;
1328   gcond *loop_cond, *inner_loop_cond = NULL;
1329 
1330   opt_result res
1331     = vect_analyze_loop_form_1 (loop, &loop_cond,
1332 				&assumptions, &number_of_iterationsm1,
1333 				&number_of_iterations, &inner_loop_cond);
1334   if (!res)
1335     return opt_loop_vec_info::propagate_failure (res);
1336 
1337   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1338   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1339   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1340   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1341   if (!integer_onep (assumptions))
1342     {
1343       /* We consider vectorizing this loop by versioning it under
1344 	 some assumptions.  In order to do this, we need to clear
1345 	 existing information computed by scev and niter analyzer.  */
1346       scev_reset_htab ();
1347       free_numbers_of_iterations_estimates (loop);
1348       /* Also set flag for this loop so that following scev and niter
1349 	 analysis are done under the assumptions.  */
1350       loop_constraint_set (loop, LOOP_C_FINITE);
1351       /* Also record the assumptions for versioning.  */
1352       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1353     }
1354 
1355   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1356     {
1357       if (dump_enabled_p ())
1358         {
1359           dump_printf_loc (MSG_NOTE, vect_location,
1360 			   "Symbolic number of iterations is ");
1361 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1362           dump_printf (MSG_NOTE, "\n");
1363         }
1364     }
1365 
1366   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1367   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1368   if (inner_loop_cond)
1369     {
1370       stmt_vec_info inner_loop_cond_info
1371 	= loop_vinfo->lookup_stmt (inner_loop_cond);
1372       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1373     }
1374 
1375   gcc_assert (!loop->aux);
1376   loop->aux = loop_vinfo;
1377   return opt_loop_vec_info::success (loop_vinfo);
1378 }
1379 
1380 
1381 
1382 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1383    statements update the vectorization factor.  */
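
/* For example (hypothetical numbers): if loop-based analysis chose VF = 4
   but some SLP instance needs an unrolling factor of 8, the two are combined
   with force_common_multiple and the final VF becomes 8; if every statement
   is pure SLP, the SLP unrolling factor is used directly.  */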
1384 
1385 static void
1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1387 {
1388   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1389   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1390   int nbbs = loop->num_nodes;
1391   poly_uint64 vectorization_factor;
1392   int i;
1393 
1394   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1395 
1396   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397   gcc_assert (known_ne (vectorization_factor, 0U));
1398 
1399   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1400      vectorization factor of the loop is the unrolling factor required by
1401      the SLP instances.  If that unrolling factor is 1, we say that we
1402      perform pure SLP on the loop - cross-iteration parallelism is not
1403      exploited.  */
1404   bool only_slp_in_loop = true;
1405   for (i = 0; i < nbbs; i++)
1406     {
1407       basic_block bb = bbs[i];
1408       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1409 	   gsi_next (&si))
1410 	{
1411 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1412 	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1413 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1414 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1415 	      && !PURE_SLP_STMT (stmt_info))
1416 	    /* STMT needs both SLP and loop-based vectorization.  */
1417 	    only_slp_in_loop = false;
1418 	}
1419     }
1420 
1421   if (only_slp_in_loop)
1422     {
1423       if (dump_enabled_p ())
1424 	dump_printf_loc (MSG_NOTE, vect_location,
1425 			 "Loop contains only SLP stmts\n");
1426       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1427     }
1428   else
1429     {
1430       if (dump_enabled_p ())
1431 	dump_printf_loc (MSG_NOTE, vect_location,
1432 			 "Loop contains SLP and non-SLP stmts\n");
1433       /* Both the vectorization factor and unroll factor have the form
1434 	 current_vector_size * X for some rational X, so they must have
1435 	 a common multiple.  */
1436       vectorization_factor
1437 	= force_common_multiple (vectorization_factor,
1438 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1439     }
1440 
1441   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1442   if (dump_enabled_p ())
1443     {
1444       dump_printf_loc (MSG_NOTE, vect_location,
1445 		       "Updating vectorization factor to ");
1446       dump_dec (MSG_NOTE, vectorization_factor);
1447       dump_printf (MSG_NOTE, ".\n");
1448     }
1449 }
1450 
1451 /* Return true if STMT_INFO describes a double reduction phi and if
1452    the other phi in the reduction is also relevant for vectorization.
1453    This rejects cases such as:
1454 
1455       outer1:
1456 	x_1 = PHI <x_3(outer2), ...>;
1457 	...
1458 
1459       inner:
1460 	x_2 = ...;
1461 	...
1462 
1463       outer2:
1464 	x_3 = PHI <x_2(inner)>;
1465 
1466    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1467 
1468 static bool
1469 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1470 {
1471   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1472     return false;
1473 
1474   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1475 }
1476 
1477 /* Function vect_analyze_loop_operations.
1478 
1479    Scan the loop stmts and make sure they are all vectorizable.  */
1480 
1481 static opt_result
1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1483 {
1484   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1485   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1486   int nbbs = loop->num_nodes;
1487   int i;
1488   stmt_vec_info stmt_info;
1489   bool need_to_vectorize = false;
1490   bool ok;
1491 
1492   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1493 
1494   auto_vec<stmt_info_for_cost> cost_vec;
1495 
1496   for (i = 0; i < nbbs; i++)
1497     {
1498       basic_block bb = bbs[i];
1499 
1500       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1501 	   gsi_next (&si))
1502         {
1503           gphi *phi = si.phi ();
1504           ok = true;
1505 
1506 	  stmt_info = loop_vinfo->lookup_stmt (phi);
1507           if (dump_enabled_p ())
1508 	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1509 	  if (virtual_operand_p (gimple_phi_result (phi)))
1510 	    continue;
1511 
1512           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1513              (i.e., a phi in the tail of the outer-loop).  */
1514           if (! is_loop_header_bb_p (bb))
1515             {
1516               /* FORNOW: we currently don't support the case that these phis
1517                  are not used in the outer loop (unless it is a double reduction,
1518                  i.e., this phi is vect_reduction_def), because this case
1519                  requires us to actually do something here.  */
1520               if (STMT_VINFO_LIVE_P (stmt_info)
1521 		  && !vect_active_double_reduction_p (stmt_info))
1522 		return opt_result::failure_at (phi,
1523 					       "Unsupported loop-closed phi"
1524 					       " in outer-loop.\n");
1525 
1526               /* If PHI is used in the outer loop, we check that its operand
1527                  is defined in the inner loop.  */
1528               if (STMT_VINFO_RELEVANT_P (stmt_info))
1529                 {
1530                   tree phi_op;
1531 
1532                   if (gimple_phi_num_args (phi) != 1)
1533                     return opt_result::failure_at (phi, "unsupported phi");
1534 
1535                   phi_op = PHI_ARG_DEF (phi, 0);
1536 		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1537 		  if (!op_def_info)
1538 		    return opt_result::failure_at (phi, "unsupported phi");
1539 
1540 		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1541 		      && (STMT_VINFO_RELEVANT (op_def_info)
1542 			  != vect_used_in_outer_by_reduction))
1543 		    return opt_result::failure_at (phi, "unsupported phi");
1544                 }
1545 
1546               continue;
1547             }
1548 
1549           gcc_assert (stmt_info);
1550 
1551           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1552                || STMT_VINFO_LIVE_P (stmt_info))
1553               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1554 	    /* A scalar-dependence cycle that we don't support.  */
1555 	    return opt_result::failure_at (phi,
1556 					   "not vectorized:"
1557 					   " scalar dependence cycle.\n");
1558 
1559           if (STMT_VINFO_RELEVANT_P (stmt_info))
1560             {
1561               need_to_vectorize = true;
1562               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1563 		  && ! PURE_SLP_STMT (stmt_info))
1564 		ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1565 					     &cost_vec);
1566 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1567 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1568 		       && ! PURE_SLP_STMT (stmt_info))
1569 		ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1570 					     &cost_vec);
1571             }
1572 
1573 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1574 	  if (ok
1575 	      && STMT_VINFO_LIVE_P (stmt_info)
1576 	      && !PURE_SLP_STMT (stmt_info))
1577 	    ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1578 					      &cost_vec);
1579 
1580           if (!ok)
1581 	    return opt_result::failure_at (phi,
1582 					   "not vectorized: relevant phi not "
1583 					   "supported: %G",
1584 					   static_cast <gimple *> (phi));
1585         }
1586 
1587       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588 	   gsi_next (&si))
1589         {
1590 	  gimple *stmt = gsi_stmt (si);
1591 	  if (!gimple_clobber_p (stmt))
1592 	    {
1593 	      opt_result res
1594 		= vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1595 				     &need_to_vectorize,
1596 				     NULL, NULL, &cost_vec);
1597 	      if (!res)
1598 		return res;
1599 	    }
1600         }
1601     } /* bbs */
1602 
1603   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1604 
1605   /* All operations in the loop are either irrelevant (they deal with loop
1606      control, or are dead), or are only used outside the loop and can be
1607      moved out of it (e.g. invariants, inductions).  The loop can be
1608      optimized away by scalar optimizations.  We're better off not
1609      touching this loop.  */
1610   if (!need_to_vectorize)
1611     {
1612       if (dump_enabled_p ())
1613         dump_printf_loc (MSG_NOTE, vect_location,
1614 			 "All the computation can be taken out of the loop.\n");
1615       return opt_result::failure_at
1616 	(vect_location,
1617 	 "not vectorized: redundant loop. no profit to vectorize.\n");
1618     }
1619 
1620   return opt_result::success ();
1621 }
1622 
1623 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1624    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1625    definitely no, or -1 if it's worth retrying.  */
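
/* For instance (numbers purely illustrative): with an assumed VF of 4,
   --param min-vect-loop-bound=0 and a cost-model answer of
   min_profitable_iters == 12, the runtime threshold below becomes
   MAX (0 * 4, 12) == 12, so a loop known to execute only 10 iterations
   is rejected as not profitable.  */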
1626 
1627 static int
1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1629 {
1630   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1632 
1633   /* Only fully-masked loops can have iteration counts less than the
1634      vectorization factor.  */
1635   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1636     {
1637       HOST_WIDE_INT max_niter;
1638 
1639       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1640 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1641       else
1642 	max_niter = max_stmt_executions_int (loop);
1643 
1644       if (max_niter != -1
1645 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1646 	{
1647 	  if (dump_enabled_p ())
1648 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 			     "not vectorized: iteration count smaller than "
1650 			     "vectorization factor.\n");
1651 	  return 0;
1652 	}
1653     }
1654 
1655   int min_profitable_iters, min_profitable_estimate;
1656   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1657 				      &min_profitable_estimate);
1658 
1659   if (min_profitable_iters < 0)
1660     {
1661       if (dump_enabled_p ())
1662 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 			 "not vectorized: vectorization not profitable.\n");
1664       if (dump_enabled_p ())
1665 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 			 "not vectorized: vector version will never be "
1667 			 "profitable.\n");
1668       return -1;
1669     }
1670 
1671   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1672 			       * assumed_vf);
1673 
1674   /* Use the cost model only if it is more conservative than the
1675      user-specified threshold.  */
1676   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1677 				    min_profitable_iters);
1678 
1679   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1680 
1681   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1682       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1683     {
1684       if (dump_enabled_p ())
1685 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 			 "not vectorized: vectorization not profitable.\n");
1687       if (dump_enabled_p ())
1688 	dump_printf_loc (MSG_NOTE, vect_location,
1689 			 "not vectorized: iteration count smaller than user "
1690 			 "specified loop bound parameter or minimum profitable "
1691 			 "iterations (whichever is more conservative).\n");
1692       return 0;
1693     }
1694 
1695   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1696   if (estimated_niter == -1)
1697     estimated_niter = likely_max_stmt_executions_int (loop);
1698   if (estimated_niter != -1
1699       && ((unsigned HOST_WIDE_INT) estimated_niter
1700 	  < MAX (th, (unsigned) min_profitable_estimate)))
1701     {
1702       if (dump_enabled_p ())
1703 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 			 "not vectorized: estimated iteration count too "
1705 			 "small.\n");
1706       if (dump_enabled_p ())
1707 	dump_printf_loc (MSG_NOTE, vect_location,
1708 			 "not vectorized: estimated iteration count smaller "
1709 			 "than specified loop bound parameter or minimum "
1710 			 "profitable iterations (whichever is more "
1711 			 "conservative).\n");
1712       return -1;
1713     }
1714 
1715   return 1;
1716 }
1717 
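/* Walk all statements in the basic blocks BBS of LOOP, counting the
   non-debug statements in *N_STMTS and recording the data references
   found in DATAREFS.  Fail when a statement contains a data reference
   that cannot be analyzed, except for calls to "#pragma omp declare
   simd" functions without data references in the call itself when the
   loop has a safelen, and fail fatally when the number of data
   references exceeds --param loop-max-datarefs-for-datadeps.  */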
1718 static opt_result
1719 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1720 			   vec<data_reference_p> *datarefs,
1721 			   unsigned int *n_stmts)
1722 {
1723   *n_stmts = 0;
1724   for (unsigned i = 0; i < loop->num_nodes; i++)
1725     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1726 	 !gsi_end_p (gsi); gsi_next (&gsi))
1727       {
1728 	gimple *stmt = gsi_stmt (gsi);
1729 	if (is_gimple_debug (stmt))
1730 	  continue;
1731 	++(*n_stmts);
1732 	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1733 	if (!res)
1734 	  {
1735 	    if (is_gimple_call (stmt) && loop->safelen)
1736 	      {
1737 		tree fndecl = gimple_call_fndecl (stmt), op;
1738 		if (fndecl != NULL_TREE)
1739 		  {
1740 		    cgraph_node *node = cgraph_node::get (fndecl);
1741 		    if (node != NULL && node->simd_clones != NULL)
1742 		      {
1743 			unsigned int j, n = gimple_call_num_args (stmt);
1744 			for (j = 0; j < n; j++)
1745 			  {
1746 			    op = gimple_call_arg (stmt, j);
1747 			    if (DECL_P (op)
1748 				|| (REFERENCE_CLASS_P (op)
1749 				    && get_base_address (op)))
1750 			      break;
1751 			  }
1752 			op = gimple_call_lhs (stmt);
1753 			/* Ignore #pragma omp declare simd functions
1754 			   if they don't have data references in the
1755 			   call stmt itself.  */
1756 			if (j == n
1757 			    && !(op
1758 				 && (DECL_P (op)
1759 				     || (REFERENCE_CLASS_P (op)
1760 					 && get_base_address (op)))))
1761 			  continue;
1762 		      }
1763 		  }
1764 	      }
1765 	    return res;
1766 	  }
1767 	/* If dependence analysis would give up due to the limit on the
1768 	   number of datarefs, stop here and fail fatally.  */
1769 	if (datarefs->length ()
1770 	    > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1771 	  return opt_result::failure_at (stmt, "exceeded param "
1772 					 "loop-max-datarefs-for-datadeps\n");
1773       }
1774   return opt_result::success ();
1775 }
1776 
1777 /* Function vect_analyze_loop_2.
1778 
1779    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780    for it.  The different analyses will record information in the
1781    loop_vec_info struct.  */
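/* Roughly (a summary of the code below, not a specification): gather and
   analyze the data references, classify scalar cycles and recognize
   patterns, analyze access patterns, mark relevant stmts, compute data
   dependences and the vectorization factor, analyze SLP opportunities,
   check alignment, operations and costs, and finally decide on masking,
   peeling and versioning.  On certain failures the analysis rolls back
   (label "again") and restarts from "start_over" with SLP disabled.  */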
1782 static opt_result
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1784 {
1785   opt_result ok = opt_result::success ();
1786   int res;
1787   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788   poly_uint64 min_vf = 2;
1789 
1790   /* The first group of checks is independent of the vector size.  */
1791   fatal = true;
1792 
1793   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1794       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1795     return opt_result::failure_at (vect_location,
1796 				   "not vectorized: simd if(0)\n");
1797 
1798   /* Find all data references in the loop (which correspond to vdefs/vuses)
1799      and analyze their evolution in the loop.  */
1800 
1801   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1802 
1803   /* Gather the data references and count stmts in the loop.  */
1804   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1805     {
1806       opt_result res
1807 	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1808 				     &LOOP_VINFO_DATAREFS (loop_vinfo),
1809 				     n_stmts);
1810       if (!res)
1811 	{
1812 	  if (dump_enabled_p ())
1813 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1814 			     "not vectorized: loop contains function "
1815 			     "calls or data references that cannot "
1816 			     "be analyzed\n");
1817 	  return res;
1818 	}
1819       loop_vinfo->shared->save_datarefs ();
1820     }
1821   else
1822     loop_vinfo->shared->check_datarefs ();
1823 
1824   /* Analyze the data references and also adjust the minimal
1825      vectorization factor according to the loads and stores.  */
1826 
1827   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1828   if (!ok)
1829     {
1830       if (dump_enabled_p ())
1831 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 			 "bad data references.\n");
1833       return ok;
1834     }
1835 
1836   /* Classify all cross-iteration scalar data-flow cycles.
1837      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1838   vect_analyze_scalar_cycles (loop_vinfo);
1839 
1840   vect_pattern_recog (loop_vinfo);
1841 
1842   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1843 
1844   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1845      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1846 
1847   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1848   if (!ok)
1849     {
1850       if (dump_enabled_p ())
1851 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 			 "bad data access.\n");
1853       return ok;
1854     }
1855 
1856   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1857 
1858   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1859   if (!ok)
1860     {
1861       if (dump_enabled_p ())
1862 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 			 "unexpected pattern.\n");
1864       return ok;
1865     }
1866 
1867   /* The rest of the analysis below depends on the vector size in some way.  */
1868   fatal = false;
1869 
1870   /* Analyze data dependences between the data-refs in the loop
1871      and adjust the maximum vectorization factor according to
1872      the dependences.
1873      FORNOW: fail at the first data dependence that we encounter.  */
1874 
1875   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1876   if (!ok)
1877     {
1878       if (dump_enabled_p ())
1879 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1880 			 "bad data dependence.\n");
1881       return ok;
1882     }
1883   if (max_vf != MAX_VECTORIZATION_FACTOR
1884       && maybe_lt (max_vf, min_vf))
1885     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1886   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1887 
1888   ok = vect_determine_vectorization_factor (loop_vinfo);
1889   if (!ok)
1890     {
1891       if (dump_enabled_p ())
1892 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 			 "can't determine vectorization factor.\n");
1894       return ok;
1895     }
1896   if (max_vf != MAX_VECTORIZATION_FACTOR
1897       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1898     return opt_result::failure_at (vect_location, "bad data dependence.\n");
1899 
1900   /* Compute the scalar iteration cost.  */
1901   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1902 
1903   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1904   unsigned th;
1905 
1906   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1907   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1908   if (!ok)
1909     return ok;
1910 
1911   /* If there are any SLP instances mark them as pure_slp.  */
1912   bool slp = vect_make_slp_decision (loop_vinfo);
1913   if (slp)
1914     {
1915       /* Find stmts that need to be both vectorized and SLPed.  */
1916       vect_detect_hybrid_slp (loop_vinfo);
1917 
1918       /* Update the vectorization factor based on the SLP decision.  */
1919       vect_update_vf_for_slp (loop_vinfo);
1920     }
1921 
1922   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1923 
1924   /* We don't expect to have to roll back to anything other than an empty
1925      set of rgroups.  */
1926   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1927 
1928   /* This is the point where we can re-start analysis with SLP forced off.  */
1929 start_over:
1930 
1931   /* Now the vectorization factor is final.  */
1932   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1933   gcc_assert (known_ne (vectorization_factor, 0U));
1934 
1935   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1936     {
1937       dump_printf_loc (MSG_NOTE, vect_location,
1938 		       "vectorization_factor = ");
1939       dump_dec (MSG_NOTE, vectorization_factor);
1940       dump_printf (MSG_NOTE, ", niters = %wd\n",
1941 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
1942     }
1943 
1944   HOST_WIDE_INT max_niter
1945     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1946 
1947   /* Analyze the alignment of the data-refs in the loop.
1948      Fail if a data reference is found that cannot be vectorized.  */
1949 
1950   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1951   if (!ok)
1952     {
1953       if (dump_enabled_p ())
1954 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955 			 "bad data alignment.\n");
1956       return ok;
1957     }
1958 
1959   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1960      It is important to call pruning after vect_analyze_data_ref_accesses,
1961      since we use grouping information gathered by interleaving analysis.  */
1962   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1963   if (!ok)
1964     return ok;
1965 
1966   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1967      vectorization, since we do not want to add extra peeling or
1968      add versioning for alignment.  */
1969   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1970     /* This pass will decide on using loop versioning and/or loop peeling in
1971        order to enhance the alignment of data references in the loop.  */
1972     ok = vect_enhance_data_refs_alignment (loop_vinfo);
1973   else
1974     ok = vect_verify_datarefs_alignment (loop_vinfo);
1975   if (!ok)
1976     return ok;
1977 
1978   if (slp)
1979     {
1980       /* Analyze operations in the SLP instances.  Note this may
1981 	 remove unsupported SLP instances which makes the above
1982 	 SLP kind detection invalid.  */
1983       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1984       vect_slp_analyze_operations (loop_vinfo);
1985       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1986 	{
1987 	  ok = opt_result::failure_at (vect_location,
1988 				       "unsupported SLP instances\n");
1989 	  goto again;
1990 	}
1991     }
1992 
1993   /* Scan all the remaining operations in the loop that are not subject
1994      to SLP and make sure they are vectorizable.  */
1995   ok = vect_analyze_loop_operations (loop_vinfo);
1996   if (!ok)
1997     {
1998       if (dump_enabled_p ())
1999 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2000 			 "bad operation or unsupported loop bound.\n");
2001       return ok;
2002     }
2003 
2004   /* Decide whether to use a fully-masked loop for this vectorization
2005      factor.  */
2006   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2007     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2008        && vect_verify_full_masking (loop_vinfo));
2009   if (dump_enabled_p ())
2010     {
2011       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2012 	dump_printf_loc (MSG_NOTE, vect_location,
2013 			 "using a fully-masked loop.\n");
2014       else
2015 	dump_printf_loc (MSG_NOTE, vect_location,
2016 			 "not using a fully-masked loop.\n");
2017     }
2018 
2019   /* If an epilogue loop is required because of data accesses with gaps,
2020      one additional iteration needs to be peeled.  Check if there are
2021      enough iterations for vectorization.  */
2022   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2023       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2025     {
2026       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2027       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2028 
2029       if (known_lt (wi::to_widest (scalar_niters), vf))
2030 	return opt_result::failure_at (vect_location,
2031 				       "loop does not have enough iterations"
2032 				       " to support peeling for gaps.\n");
2033     }
2034 
2035   /* Check that the costings of the loop make vectorizing worthwhile.  */
2036   res = vect_analyze_loop_costing (loop_vinfo);
2037   if (res < 0)
2038     {
2039       ok = opt_result::failure_at (vect_location,
2040 				   "Loop costings may not be worthwhile.\n");
2041       goto again;
2042     }
2043   if (!res)
2044     return opt_result::failure_at (vect_location,
2045 				   "Loop costings not worthwhile.\n");
2046 
2047   /* Decide whether we need to create an epilogue loop to handle
2048      remaining scalar iterations.  */
2049   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2050 
2051   unsigned HOST_WIDE_INT const_vf;
2052   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2053     /* The main loop handles all iterations.  */
2054     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2055   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2056 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2057     {
2058       /* Work out the (constant) number of iterations that need to be
2059 	 peeled for reasons other than niters.  */
2060       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2061       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2062 	peel_niter += 1;
2063       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2064 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2065 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2066     }
2067   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2068 	   /* ??? When peeling for gaps but not alignment, we could
2069 	      try to check whether the (variable) niters is known to be
2070 	      VF * N + 1.  That's something of a niche case though.  */
2071 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2072 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2073 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2074 		< (unsigned) exact_log2 (const_vf))
2075 	       /* In case of versioning, check if the maximum number of
2076 		  iterations is greater than th.  If they are identical,
2077 		  the epilogue is unnecessary.  */
2078 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2079 		   || ((unsigned HOST_WIDE_INT) max_niter
2080 		       > (th / const_vf) * const_vf))))
2081     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2082 
2083   /* If an epilogue loop is required, make sure we can create one.  */
2084   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2085       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2086     {
2087       if (dump_enabled_p ())
2088         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2089       if (!vect_can_advance_ivs_p (loop_vinfo)
2090 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2091 					   single_exit (LOOP_VINFO_LOOP
2092 							 (loop_vinfo))))
2093         {
2094 	  ok = opt_result::failure_at (vect_location,
2095 				       "not vectorized: can't create required "
2096 				       "epilog loop\n");
2097           goto again;
2098         }
2099     }
2100 
2101   /* During peeling, we need to check if the number of loop iterations is
2102      enough for both the peeled prolog loop and the vector loop.  This check
2103      can be merged with the threshold check of loop versioning, so
2104      increase the threshold for this case if necessary.  */
2105   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2106     {
2107       poly_uint64 niters_th = 0;
2108 
2109       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2110 	{
2111 	  /* Niters for peeled prolog loop.  */
2112 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2113 	    {
2114 	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2115 	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2116 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2117 	    }
2118 	  else
2119 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2120 	}
2121 
2122       /* Niters for at least one iteration of vectorized loop.  */
2123       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2124 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2125       /* One additional iteration because of peeling for gap.  */
2126       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2127 	niters_th += 1;
2128       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2129     }
2130 
2131   gcc_assert (known_eq (vectorization_factor,
2132 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2133 
2134   /* Ok to vectorize!  */
2135   return opt_result::success ();
2136 
2137 again:
2138   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2139   gcc_assert (!ok);
2140 
2141   /* Try again with SLP forced off, but if we didn't do any SLP there is
2142      no point in re-trying.  */
2143   if (!slp)
2144     return ok;
2145 
2146   /* If there are reduction chains re-trying will fail anyway.  */
2147   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2148     return ok;
2149 
2150   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2151      via interleaving or lane instructions.  */
2152   slp_instance instance;
2153   slp_tree node;
2154   unsigned i, j;
2155   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2156     {
2157       stmt_vec_info vinfo;
2158       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2159       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2160 	continue;
2161       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2162       unsigned int size = DR_GROUP_SIZE (vinfo);
2163       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2164       if (! vect_store_lanes_supported (vectype, size, false)
2165 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2166 	 && ! vect_grouped_store_supported (vectype, size))
2167 	return opt_result::failure_at (vinfo->stmt,
2168 				       "unsupported grouped store\n");
2169       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2170 	{
2171 	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2172 	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2173 	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2174 	  size = DR_GROUP_SIZE (vinfo);
2175 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2176 	  if (! vect_load_lanes_supported (vectype, size, false)
2177 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2178 						size))
2179 	    return opt_result::failure_at (vinfo->stmt,
2180 					   "unsupported grouped load\n");
2181 	}
2182     }
2183 
2184   if (dump_enabled_p ())
2185     dump_printf_loc (MSG_NOTE, vect_location,
2186 		     "re-trying with SLP disabled\n");
2187 
2188   /* Roll back state appropriately.  No SLP this time.  */
2189   slp = false;
2190   /* Restore the vectorization factor as it was without SLP.  */
2191   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2192   /* Free the SLP instances.  */
2193   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2194     vect_free_slp_instance (instance, false);
2195   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2196   /* Reset SLP type to loop_vect on all stmts.  */
2197   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2198     {
2199       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2200       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2201 	   !gsi_end_p (si); gsi_next (&si))
2202 	{
2203 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2204 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2205 	}
2206       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2207 	   !gsi_end_p (si); gsi_next (&si))
2208 	{
2209 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2210 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2211 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2212 	    {
2213 	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2214 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2215 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2216 	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2217 		   !gsi_end_p (pi); gsi_next (&pi))
2218 		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2219 		  = loop_vect;
2220 	    }
2221 	}
2222     }
2223   /* Free optimized alias test DDRS.  */
2224   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2225   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2226   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2227   /* Reset target cost data.  */
2228   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2229   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2230     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2231   /* Reset accumulated rgroup information.  */
2232   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2233   /* Reset assorted flags.  */
2234   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2235   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2236   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2237   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2238   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2239 
2240   goto start_over;
2241 }
2242 
2243 /* Function vect_analyze_loop.
2244 
2245    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2246    for it.  The different analyses will record information in the
2247    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2248    be vectorized.  */
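/* The analysis below is retried with each vector size the target
   advertises via targetm.vectorize.autovectorize_vector_sizes until
   analysis succeeds, a failure is deemed fatal, or there are no further
   sizes to try.  */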
2249 opt_loop_vec_info
2250 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2251 		   vec_info_shared *shared)
2252 {
2253   auto_vector_sizes vector_sizes;
2254 
2255   /* Autodetect first vector size we try.  */
2256   current_vector_size = 0;
2257   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2258   unsigned int next_size = 0;
2259 
2260   DUMP_VECT_SCOPE ("analyze_loop_nest");
2261 
2262   if (loop_outer (loop)
2263       && loop_vec_info_for_loop (loop_outer (loop))
2264       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2265     return opt_loop_vec_info::failure_at (vect_location,
2266 					  "outer-loop already vectorized.\n");
2267 
2268   if (!find_loop_nest (loop, &shared->loop_nest))
2269     return opt_loop_vec_info::failure_at
2270       (vect_location,
2271        "not vectorized: loop nest containing two or more consecutive inner"
2272        " loops cannot be vectorized\n");
2273 
2274   unsigned n_stmts = 0;
2275   poly_uint64 autodetected_vector_size = 0;
2276   while (1)
2277     {
2278       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2279       opt_loop_vec_info loop_vinfo
2280 	= vect_analyze_loop_form (loop, shared);
2281       if (!loop_vinfo)
2282 	{
2283 	  if (dump_enabled_p ())
2284 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 			     "bad loop form.\n");
2286 	  return loop_vinfo;
2287 	}
2288 
2289       bool fatal = false;
2290 
2291       if (orig_loop_vinfo)
2292 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2293 
2294       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2295       if (res)
2296 	{
2297 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2298 
2299 	  return loop_vinfo;
2300 	}
2301 
2302       delete loop_vinfo;
2303 
2304       if (next_size == 0)
2305 	autodetected_vector_size = current_vector_size;
2306 
2307       if (next_size < vector_sizes.length ()
2308 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
2309 	next_size += 1;
2310 
2311       if (fatal
2312 	  || next_size == vector_sizes.length ()
2313 	  || known_eq (current_vector_size, 0U))
2314 	return opt_loop_vec_info::propagate_failure (res);
2315 
2316       /* Try the next biggest vector size.  */
2317       current_vector_size = vector_sizes[next_size++];
2318       if (dump_enabled_p ())
2319 	{
2320 	  dump_printf_loc (MSG_NOTE, vect_location,
2321 			   "***** Re-trying analysis with "
2322 			   "vector size ");
2323 	  dump_dec (MSG_NOTE, current_vector_size);
2324 	  dump_printf (MSG_NOTE, "\n");
2325 	}
2326     }
2327 }
2328 
2329 /* Return true if there is an in-order reduction function for CODE, storing
2330    it in *REDUC_FN if so.  */
2331 
2332 static bool
2333 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2334 {
2335   switch (code)
2336     {
2337     case PLUS_EXPR:
2338       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2339       return true;
2340 
2341     default:
2342       return false;
2343     }
2344 }
2345 
2346 /* Function reduction_fn_for_scalar_code
2347 
2348    Input:
2349    CODE - tree_code of a reduction operation.
2350 
2351    Output:
2352    REDUC_FN - the corresponding internal function to be used to reduce the
2353       vector of partial results into a single scalar result, or IFN_LAST
2354       if the operation is a supported reduction operation, but does not have
2355       such an internal function.
2356 
2357    Return FALSE if CODE currently cannot be vectorized as reduction.  */
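
/* For example, a MAX_EXPR reduction epilogue can use a single
   IFN_REDUC_MAX call to reduce the vector of partial maxima to one
   scalar, whereas MULT_EXPR gets IFN_LAST and the partial results are
   instead reduced element by element (or by whole-vector shifts) in the
   epilogue.  */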
2358 
2359 static bool
2360 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2361 {
2362   switch (code)
2363     {
2364       case MAX_EXPR:
2365         *reduc_fn = IFN_REDUC_MAX;
2366         return true;
2367 
2368       case MIN_EXPR:
2369         *reduc_fn = IFN_REDUC_MIN;
2370         return true;
2371 
2372       case PLUS_EXPR:
2373         *reduc_fn = IFN_REDUC_PLUS;
2374         return true;
2375 
2376       case BIT_AND_EXPR:
2377 	*reduc_fn = IFN_REDUC_AND;
2378 	return true;
2379 
2380       case BIT_IOR_EXPR:
2381 	*reduc_fn = IFN_REDUC_IOR;
2382 	return true;
2383 
2384       case BIT_XOR_EXPR:
2385 	*reduc_fn = IFN_REDUC_XOR;
2386 	return true;
2387 
2388       case MULT_EXPR:
2389       case MINUS_EXPR:
2390         *reduc_fn = IFN_LAST;
2391         return true;
2392 
2393       default:
2394        return false;
2395     }
2396 }
2397 
2398 /* If there is a neutral value X such that SLP reduction NODE would not
2399    be affected by the introduction of additional X elements, return that X,
2400    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2401    is true if the SLP statements perform a single reduction, false if each
2402    statement performs an independent reduction.  */
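
/* For example, 0 is neutral for PLUS_EXPR, 1 for MULT_EXPR and all-ones
   for BIT_AND_EXPR: any number of such elements can be folded into the
   reduction without changing its final value.  */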
2403 
2404 static tree
2405 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2406 			      bool reduc_chain)
2407 {
2408   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2409   stmt_vec_info stmt_vinfo = stmts[0];
2410   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2411   tree scalar_type = TREE_TYPE (vector_type);
2412   struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2413   gcc_assert (loop);
2414 
2415   switch (code)
2416     {
2417     case WIDEN_SUM_EXPR:
2418     case DOT_PROD_EXPR:
2419     case SAD_EXPR:
2420     case PLUS_EXPR:
2421     case MINUS_EXPR:
2422     case BIT_IOR_EXPR:
2423     case BIT_XOR_EXPR:
2424       return build_zero_cst (scalar_type);
2425 
2426     case MULT_EXPR:
2427       return build_one_cst (scalar_type);
2428 
2429     case BIT_AND_EXPR:
2430       return build_all_ones_cst (scalar_type);
2431 
2432     case MAX_EXPR:
2433     case MIN_EXPR:
2434       /* For MIN/MAX the initial values are neutral.  A reduction chain
2435 	 has only a single initial value, so that value is neutral for
2436 	 all statements.  */
2437       if (reduc_chain)
2438 	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2439 				      loop_preheader_edge (loop));
2440       return NULL_TREE;
2441 
2442     default:
2443       return NULL_TREE;
2444     }
2445 }
2446 
2447 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2448    STMT is printed with a message MSG. */
2449 
2450 static void
2451 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2452 {
2453   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2454 }
2455 
2456 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2457    operation.  Return true if the results of DEF_STMT_INFO are something
2458    that can be accumulated by such a reduction.  */
2459 
2460 static bool
2461 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2462 {
2463   return (is_gimple_assign (def_stmt_info->stmt)
2464 	  || is_gimple_call (def_stmt_info->stmt)
2465 	  || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2466 	  || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2467 	      && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2468 	      && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2469 }
2470 
2471 /* Detect SLP reduction of the form:
2472 
2473    #a1 = phi <a5, a0>
2474    a2 = operation (a1)
2475    a3 = operation (a2)
2476    a4 = operation (a3)
2477    a5 = operation (a4)
2478 
2479    #a = phi <a5>
2480 
2481    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2482    FIRST_STMT is the first reduction stmt in the chain
2483    (a2 = operation (a1)).
2484 
2485    Return TRUE if a reduction chain was detected.  */
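
/* A source-level sketch (illustrative only; a and N are placeholders)
   that typically produces such a chain:

	int sum = 0;
	for (i = 0; i < N; i++)
	  sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into a chain of dependent additions feeding the
   reduction PHI.  */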
2486 
2487 static bool
2488 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2489 		       gimple *first_stmt)
2490 {
2491   struct loop *loop = (gimple_bb (phi))->loop_father;
2492   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2493   enum tree_code code;
2494   gimple *loop_use_stmt = NULL;
2495   stmt_vec_info use_stmt_info;
2496   tree lhs;
2497   imm_use_iterator imm_iter;
2498   use_operand_p use_p;
2499   int nloop_uses, size = 0, n_out_of_loop_uses;
2500   bool found = false;
2501 
2502   if (loop != vect_loop)
2503     return false;
2504 
2505   auto_vec<stmt_vec_info, 8> reduc_chain;
2506   lhs = PHI_RESULT (phi);
2507   code = gimple_assign_rhs_code (first_stmt);
2508   while (1)
2509     {
2510       nloop_uses = 0;
2511       n_out_of_loop_uses = 0;
2512       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2513         {
2514 	  gimple *use_stmt = USE_STMT (use_p);
2515 	  if (is_gimple_debug (use_stmt))
2516 	    continue;
2517 
2518           /* Check if we got back to the reduction phi.  */
2519 	  if (use_stmt == phi)
2520             {
2521 	      loop_use_stmt = use_stmt;
2522               found = true;
2523               break;
2524             }
2525 
2526           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2527             {
2528 	      loop_use_stmt = use_stmt;
2529 	      nloop_uses++;
2530             }
2531            else
2532              n_out_of_loop_uses++;
2533 
2534            /* There can be either a single use in the loop or two uses in
2535               phi nodes.  */
2536            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2537              return false;
2538         }
2539 
2540       if (found)
2541         break;
2542 
2543       /* We reached a statement with no loop uses.  */
2544       if (nloop_uses == 0)
2545 	return false;
2546 
2547       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2548       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2549         return false;
2550 
2551       if (!is_gimple_assign (loop_use_stmt)
2552 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2553 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2554         return false;
2555 
2556       /* Insert USE_STMT into reduction chain.  */
2557       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2558       reduc_chain.safe_push (use_stmt_info);
2559 
2560       lhs = gimple_assign_lhs (loop_use_stmt);
2561       size++;
2562    }
2563 
2564   if (!found || loop_use_stmt != phi || size < 2)
2565     return false;
2566 
2567   /* Swap the operands, if needed, to make the reduction operand be the second
2568      operand.  */
2569   lhs = PHI_RESULT (phi);
2570   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2571     {
2572       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2573       if (gimple_assign_rhs2 (next_stmt) == lhs)
2574 	{
2575 	  tree op = gimple_assign_rhs1 (next_stmt);
2576 	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2577 
2578 	  /* Check that the other def is either defined in the loop
2579 	     ("vect_internal_def"), or it's an induction (defined by a
2580 	     loop-header phi-node).  */
2581 	  if (def_stmt_info
2582 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2583 	      && vect_valid_reduction_input_p (def_stmt_info))
2584 	    {
2585 	      lhs = gimple_assign_lhs (next_stmt);
2586  	      continue;
2587 	    }
2588 
2589 	  return false;
2590 	}
2591       else
2592 	{
2593           tree op = gimple_assign_rhs2 (next_stmt);
2594 	  stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2595 
2596           /* Check that the other def is either defined in the loop
2597             ("vect_internal_def"), or it's an induction (defined by a
2598             loop-header phi-node).  */
2599 	  if (def_stmt_info
2600 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2601 	      && vect_valid_reduction_input_p (def_stmt_info))
2602   	    {
2603 	      if (dump_enabled_p ())
2604 		dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2605 				 next_stmt);
2606 
2607 	      swap_ssa_operands (next_stmt,
2608 	 		         gimple_assign_rhs1_ptr (next_stmt),
2609                                  gimple_assign_rhs2_ptr (next_stmt));
2610 	      update_stmt (next_stmt);
2611 
2612 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2613 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2614 	    }
2615 	  else
2616 	    return false;
2617         }
2618 
2619       lhs = gimple_assign_lhs (next_stmt);
2620     }
2621 
2622   /* Build up the actual chain.  */
2623   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2624     {
2625       REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2626       REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2627     }
2628   REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2629   REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2630 
2631   /* Save the chain for further analysis in SLP detection.  */
2632   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2633   REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2634 
2635   return true;
2636 }
2637 
2638 /* Return true if we need an in-order reduction for operation CODE
2639    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2640    overflow must wrap.  */
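
/* For example (illustrative only), a floating-point sum

	double s = 0.0;
	for (i = 0; i < N; i++)
	  s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is
   in effect, and a signed integer sum needs one when the addition could
   trap on overflow (e.g. with -ftrapv).  */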
2641 
2642 static bool
2643 needs_fold_left_reduction_p (tree type, tree_code code,
2644 			     bool need_wrapping_integral_overflow)
2645 {
2646   /* CHECKME: check for !flag_finite_math_only too?  */
2647   if (SCALAR_FLOAT_TYPE_P (type))
2648     switch (code)
2649       {
2650       case MIN_EXPR:
2651       case MAX_EXPR:
2652 	return false;
2653 
2654       default:
2655 	return !flag_associative_math;
2656       }
2657 
2658   if (INTEGRAL_TYPE_P (type))
2659     {
2660       if (!operation_no_trapping_overflow (type, code))
2661 	return true;
2662       if (need_wrapping_integral_overflow
2663 	  && !TYPE_OVERFLOW_WRAPS (type)
2664 	  && operation_can_overflow (code))
2665 	return true;
2666       return false;
2667     }
2668 
2669   if (SAT_FIXED_POINT_TYPE_P (type))
2670     return true;
2671 
2672   return false;
2673 }
2674 
2675 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2676    reduction operation CODE has a handled computation expression.  */
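
/* For instance (illustrative SSA names), with CODE == PLUS_EXPR and

	x_1 = PHI <x_4(latch), x_0(preheader)>
	...
	x_2 = x_1 + a_5;
	x_3 = x_2 + b_6;
	x_4 = x_3 - c_7;

   the path from the latch value x_4 back to x_1 consists only of
   single-use PLUS_EXPR/MINUS_EXPR statements, so it is accepted;
   a MINUS_EXPR is fine as long as the reduction value does not end up
   negated overall.  */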
2677 
2678 bool
2679 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2680 		      tree loop_arg, enum tree_code code)
2681 {
2682   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2683   auto_bitmap visited;
2684   tree lookfor = PHI_RESULT (phi);
2685   ssa_op_iter curri;
2686   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2687   while (USE_FROM_PTR (curr) != loop_arg)
2688     curr = op_iter_next_use (&curri);
2689   curri.i = curri.numops;
2690   do
2691     {
2692       path.safe_push (std::make_pair (curri, curr));
2693       tree use = USE_FROM_PTR (curr);
2694       if (use == lookfor)
2695 	break;
2696       gimple *def = SSA_NAME_DEF_STMT (use);
2697       if (gimple_nop_p (def)
2698 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2699 	{
2700 pop:
2701 	  do
2702 	    {
2703 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2704 	      curri = x.first;
2705 	      curr = x.second;
2706 	      do
2707 		curr = op_iter_next_use (&curri);
2708 	      /* Skip already visited or non-SSA operands (from iterating
2709 	         over PHI args).  */
2710 	      while (curr != NULL_USE_OPERAND_P
2711 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2712 			 || ! bitmap_set_bit (visited,
2713 					      SSA_NAME_VERSION
2714 					        (USE_FROM_PTR (curr)))));
2715 	    }
2716 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2717 	  if (curr == NULL_USE_OPERAND_P)
2718 	    break;
2719 	}
2720       else
2721 	{
2722 	  if (gimple_code (def) == GIMPLE_PHI)
2723 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2724 	  else
2725 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2726 	  while (curr != NULL_USE_OPERAND_P
2727 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2728 		     || ! bitmap_set_bit (visited,
2729 					  SSA_NAME_VERSION
2730 					    (USE_FROM_PTR (curr)))))
2731 	    curr = op_iter_next_use (&curri);
2732 	  if (curr == NULL_USE_OPERAND_P)
2733 	    goto pop;
2734 	}
2735     }
2736   while (1);
2737   if (dump_file && (dump_flags & TDF_DETAILS))
2738     {
2739       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2740       unsigned i;
2741       std::pair<ssa_op_iter, use_operand_p> *x;
2742       FOR_EACH_VEC_ELT (path, i, x)
2743 	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2744       dump_printf (MSG_NOTE, "\n");
2745     }
2746 
2747   /* Check whether the reduction path detected is valid.  */
2748   bool fail = path.length () == 0;
2749   bool neg = false;
2750   for (unsigned i = 1; i < path.length (); ++i)
2751     {
2752       gimple *use_stmt = USE_STMT (path[i].second);
2753       tree op = USE_FROM_PTR (path[i].second);
2754       if (! has_single_use (op)
2755 	  || ! is_gimple_assign (use_stmt))
2756 	{
2757 	  fail = true;
2758 	  break;
2759 	}
2760       if (gimple_assign_rhs_code (use_stmt) != code)
2761 	{
2762 	  if (code == PLUS_EXPR
2763 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2764 	    {
2765 	      /* Track whether we negate the reduction value each iteration.  */
2766 	      if (gimple_assign_rhs2 (use_stmt) == op)
2767 		neg = ! neg;
2768 	    }
2769 	  else
2770 	    {
2771 	      fail = true;
2772 	      break;
2773 	    }
2774 	}
2775     }
2776   return ! fail && ! neg;
2777 }
2778 
2779 
2780 /* Function vect_is_simple_reduction
2781 
2782    (1) Detect a cross-iteration def-use cycle that represents a simple
2783    reduction computation.  We look for the following pattern:
2784 
2785    loop_header:
2786      a1 = phi < a0, a2 >
2787      a3 = ...
2788      a2 = operation (a3, a1)
2789 
2790    or
2791 
2792    a3 = ...
2793    loop_header:
2794      a1 = phi < a0, a2 >
2795      a2 = operation (a3, a1)
2796 
2797    such that:
2798    1. operation is commutative and associative and it is safe to
2799       change the order of the computation
2800    2. no uses for a2 in the loop (a2 is used out of the loop)
2801    3. no uses of a1 in the loop besides the reduction operation
2802    4. no uses of a1 outside the loop.
2803 
2804    Conditions 1,4 are tested here.
2805    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2806 
2807    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2808    nested cycles.
2809 
2810    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2811    reductions:
2812 
2813      a1 = phi < a0, a2 >
2814      inner loop (def of a3)
2815      a2 = phi < a3 >
2816 
2817    (4) Detect condition expressions, i.e.:
2818      for (int i = 0; i < N; i++)
2819        if (a[i] < val)
2820 	ret_val = a[i];
2821 
2822 */
2823 
2824 static stmt_vec_info
2825 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2826 			  bool *double_reduc,
2827 			  bool need_wrapping_integral_overflow,
2828 			  enum vect_reduction_type *v_reduc_type)
2829 {
2830   gphi *phi = as_a <gphi *> (phi_info->stmt);
2831   struct loop *loop = (gimple_bb (phi))->loop_father;
2832   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2833   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2834   gimple *phi_use_stmt = NULL;
2835   enum tree_code orig_code, code;
2836   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2837   tree type;
2838   tree name;
2839   imm_use_iterator imm_iter;
2840   use_operand_p use_p;
2841   bool phi_def;
2842 
2843   *double_reduc = false;
2844   *v_reduc_type = TREE_CODE_REDUCTION;
2845 
2846   tree phi_name = PHI_RESULT (phi);
2847   /* ???  If there are no uses of the PHI result the inner loop reduction
2848      won't be detected as possibly double-reduction by vectorizable_reduction
2849      because that tries to walk the PHI arg from the preheader edge which
2850      can be constant.  See PR60382.  */
2851   if (has_zero_uses (phi_name))
2852     return NULL;
2853   unsigned nphi_def_loop_uses = 0;
2854   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2855     {
2856       gimple *use_stmt = USE_STMT (use_p);
2857       if (is_gimple_debug (use_stmt))
2858 	continue;
2859 
2860       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2861         {
2862           if (dump_enabled_p ())
2863 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864 			     "intermediate value used outside loop.\n");
2865 
2866           return NULL;
2867         }
2868 
2869       nphi_def_loop_uses++;
2870       phi_use_stmt = use_stmt;
2871     }
2872 
2873   edge latch_e = loop_latch_edge (loop);
2874   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2875   if (TREE_CODE (loop_arg) != SSA_NAME)
2876     {
2877       if (dump_enabled_p ())
2878 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2879 			 "reduction: not ssa_name: %T\n", loop_arg);
2880       return NULL;
2881     }
2882 
2883   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2884   if (!def_stmt_info
2885       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2886     return NULL;
2887 
2888   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2889     {
2890       name = gimple_assign_lhs (def_stmt);
2891       phi_def = false;
2892     }
2893   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2894     {
2895       name = PHI_RESULT (def_stmt);
2896       phi_def = true;
2897     }
2898   else
2899     {
2900       if (dump_enabled_p ())
2901 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 			 "reduction: unhandled reduction operation: %G",
2903 			 def_stmt_info->stmt);
2904       return NULL;
2905     }
2906 
2907   unsigned nlatch_def_loop_uses = 0;
2908   auto_vec<gphi *, 3> lcphis;
2909   bool inner_loop_of_double_reduc = false;
2910   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2911     {
2912       gimple *use_stmt = USE_STMT (use_p);
2913       if (is_gimple_debug (use_stmt))
2914 	continue;
2915       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2916 	nlatch_def_loop_uses++;
2917       else
2918 	{
2919 	  /* We can have more than one loop-closed PHI.  */
2920 	  lcphis.safe_push (as_a <gphi *> (use_stmt));
2921 	  if (nested_in_vect_loop
2922 	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2923 		  == vect_double_reduction_def))
2924 	    inner_loop_of_double_reduc = true;
2925 	}
2926     }
2927 
2928   /* If this isn't a nested cycle, or if the nested cycle reduction value
2929      is used outside of the inner loop, we cannot handle uses of the
2930      reduction value.  */
2931   if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2932       && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2933     {
2934       if (dump_enabled_p ())
2935 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 			 "reduction used in loop.\n");
2937       return NULL;
2938     }
2939 
2940   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2941      defined in the inner loop.  */
2942   if (phi_def)
2943     {
2944       gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2945       op1 = PHI_ARG_DEF (def_stmt, 0);
2946 
2947       if (gimple_phi_num_args (def_stmt) != 1
2948           || TREE_CODE (op1) != SSA_NAME)
2949         {
2950           if (dump_enabled_p ())
2951 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2952 			     "unsupported phi node definition.\n");
2953 
2954           return NULL;
2955         }
2956 
2957       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2958       if (gimple_bb (def1)
2959 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2960           && loop->inner
2961           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2962           && is_gimple_assign (def1)
2963 	  && is_a <gphi *> (phi_use_stmt)
2964 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2965         {
2966           if (dump_enabled_p ())
2967             report_vect_op (MSG_NOTE, def_stmt,
2968 			    "detected double reduction: ");
2969 
2970           *double_reduc = true;
2971 	  return def_stmt_info;
2972         }
2973 
2974       return NULL;
2975     }
2976 
2977   /* If we are vectorizing an inner reduction, we execute it in the
2978      original order only when we are not dealing with a double
2979      reduction.  */
2980   bool check_reduction = true;
2981   if (flow_loop_nested_p (vect_loop, loop))
2982     {
2983       gphi *lcphi;
2984       unsigned i;
2985       check_reduction = false;
2986       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2987 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2988 	  {
2989 	    gimple *use_stmt = USE_STMT (use_p);
2990 	    if (is_gimple_debug (use_stmt))
2991 	      continue;
2992 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2993 	      check_reduction = true;
2994 	  }
2995     }
2996 
2997   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2998   code = orig_code = gimple_assign_rhs_code (def_stmt);
2999 
3000   if (nested_in_vect_loop && !check_reduction)
3001     {
3002       /* FIXME: Even for non-reductions, code generation is funneled
3003 	 through vectorizable_reduction for the stmt defining the
3004 	 PHI latch value.  So we have to artificially restrict ourselves
3005 	 to the supported operations.  */
3006       switch (get_gimple_rhs_class (code))
3007 	{
3008 	case GIMPLE_BINARY_RHS:
3009 	case GIMPLE_TERNARY_RHS:
3010 	  break;
3011 	default:
3012 	  /* Not supported by vectorizable_reduction.  */
3013 	  if (dump_enabled_p ())
3014 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3015 			    "nested cycle: not handled operation: ");
3016 	  return NULL;
3017 	}
3018       if (dump_enabled_p ())
3019 	report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3020       return def_stmt_info;
3021     }
3022 
3023   /* We can handle "res -= x[i]", which is non-associative, by
3024      simply rewriting it into "res += -x[i]".  Avoid changing the
3025      gimple instruction for the first simple tests and only do this
3026      if we're allowed to change code at all.  */
3027   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3028     code = PLUS_EXPR;
3029 
3030   if (code == COND_EXPR)
3031     {
3032       if (! nested_in_vect_loop)
3033 	*v_reduc_type = COND_REDUCTION;
3034 
3035       op3 = gimple_assign_rhs1 (def_stmt);
3036       if (COMPARISON_CLASS_P (op3))
3037         {
3038           op4 = TREE_OPERAND (op3, 1);
3039           op3 = TREE_OPERAND (op3, 0);
3040         }
3041       if (op3 == phi_name || op4 == phi_name)
3042 	{
3043 	  if (dump_enabled_p ())
3044 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3045 			    "reduction: condition depends on previous"
3046 			    " iteration: ");
3047 	  return NULL;
3048 	}
3049 
3050       op1 = gimple_assign_rhs2 (def_stmt);
3051       op2 = gimple_assign_rhs3 (def_stmt);
3052     }
3053   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3054     {
3055       if (dump_enabled_p ())
3056 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3057 			"reduction: not commutative/associative: ");
3058       return NULL;
3059     }
3060   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3061     {
3062       op1 = gimple_assign_rhs1 (def_stmt);
3063       op2 = gimple_assign_rhs2 (def_stmt);
3064     }
3065   else
3066     {
3067       if (dump_enabled_p ())
3068 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 			"reduction: not handled operation: ");
3070       return NULL;
3071     }
3072 
3073   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3074     {
3075       if (dump_enabled_p ())
3076 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3077 			"reduction: both uses not ssa_names: ");
3078 
3079       return NULL;
3080     }
3081 
3082   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3083   if ((TREE_CODE (op1) == SSA_NAME
3084        && !types_compatible_p (type,TREE_TYPE (op1)))
3085       || (TREE_CODE (op2) == SSA_NAME
3086           && !types_compatible_p (type, TREE_TYPE (op2)))
3087       || (op3 && TREE_CODE (op3) == SSA_NAME
3088           && !types_compatible_p (type, TREE_TYPE (op3)))
3089       || (op4 && TREE_CODE (op4) == SSA_NAME
3090           && !types_compatible_p (type, TREE_TYPE (op4))))
3091     {
3092       if (dump_enabled_p ())
3093         {
3094           dump_printf_loc (MSG_NOTE, vect_location,
3095 			   "reduction: multiple types: operation type: "
3096 			   "%T, operands types: %T,%T",
3097 			   type,  TREE_TYPE (op1), TREE_TYPE (op2));
3098           if (op3)
3099 	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3100 
3101           if (op4)
3102 	    dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3103           dump_printf (MSG_NOTE, "\n");
3104         }
3105 
3106       return NULL;
3107     }
3108 
3109   /* Check whether it's ok to change the order of the computation.
3110      Generally, when vectorizing a reduction we change the order of the
3111      computation.  This may change the behavior of the program in some
3112      cases, so we need to check that this is ok.  One exception is when
3113      vectorizing an outer-loop: the inner-loop is executed sequentially,
3114      and therefore vectorizing reductions in the inner-loop during
3115      outer-loop vectorization is safe.  */
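  /* For instance (an illustrative sketch), a scalar floating-point sum is
     accumulated as (((a[0] + a[1]) + a[2]) + a[3]), whereas a two-lane
     vectorization computes the partial sums (a[0] + a[2]) and
     (a[1] + a[3]) and combines them afterwards, which can round
     differently; hence the fold-left fallback chosen below when such
     reordering is not allowed.  */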
3116   if (check_reduction
3117       && *v_reduc_type == TREE_CODE_REDUCTION
3118       && needs_fold_left_reduction_p (type, code,
3119 				      need_wrapping_integral_overflow))
3120     *v_reduc_type = FOLD_LEFT_REDUCTION;
3121 
3122   /* Reduction is safe. We're dealing with one of the following:
3123      1) integer arithmetic and no trapv
3124      2) floating point arithmetic, and special flags permit this optimization
3125      3) nested cycle (i.e., outer loop vectorization).  */
3126   stmt_vec_info def1_info = loop_info->lookup_def (op1);
3127   stmt_vec_info def2_info = loop_info->lookup_def (op2);
3128   if (code != COND_EXPR && !def1_info && !def2_info)
3129     {
3130       if (dump_enabled_p ())
3131 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3132       return NULL;
3133     }
3134 
3135   /* Check that one def is the reduction def, defined by PHI,
3136      the other def is either defined in the loop ("vect_internal_def"),
3137      or it's an induction (defined by a loop-header phi-node).  */
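  /* For example (a sketch), for the scalar cycle

	 x_1 = PHI <x_0 (preheader), x_2 (latch)>
	 ...
	 x_2 = a_5 + x_1;

     op2 is x_1, so def2_info is the PHI itself while def1_info is the
     loop-internal definition of a_5, and the first test below succeeds.
     With the operands the other way round (x_2 = x_1 + a_5) the second
     test succeeds instead and, when allowed, the operands are swapped so
     that the reduction variable ends up last.  */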
3138 
3139   if (def2_info
3140       && def2_info->stmt == phi
3141       && (code == COND_EXPR
3142 	  || !def1_info
3143 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3144 	  || vect_valid_reduction_input_p (def1_info)))
3145     {
3146       if (dump_enabled_p ())
3147 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3148       return def_stmt_info;
3149     }
3150 
3151   if (def1_info
3152       && def1_info->stmt == phi
3153       && (code == COND_EXPR
3154 	  || !def2_info
3155 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3156 	  || vect_valid_reduction_input_p (def2_info)))
3157     {
3158       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3159 	{
3160 	  /* Check if we can swap operands (just for simplicity - so that
3161 	     the rest of the code can assume that the reduction variable
3162 	     is always the last (second) argument).  */
3163 	  if (code == COND_EXPR)
3164 	    {
3165 	      /* Swap cond_expr by inverting the condition.  */
3166 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
3167 	      enum tree_code invert_code = ERROR_MARK;
3168 	      enum tree_code cond_code = TREE_CODE (cond_expr);
3169 
3170 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3171 		{
3172 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3173 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
3174 		}
3175 	      if (invert_code != ERROR_MARK)
3176 		{
3177 		  TREE_SET_CODE (cond_expr, invert_code);
3178 		  swap_ssa_operands (def_stmt,
3179 				     gimple_assign_rhs2_ptr (def_stmt),
3180 				     gimple_assign_rhs3_ptr (def_stmt));
3181 		}
3182 	      else
3183 		{
3184 		  if (dump_enabled_p ())
3185 		    report_vect_op (MSG_NOTE, def_stmt,
3186 				    "detected reduction: cannot swap operands "
3187 				    "for cond_expr");
3188 		  return NULL;
3189 		}
3190 	    }
3191 	  else
3192 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3193 			       gimple_assign_rhs2_ptr (def_stmt));
3194 
3195 	  if (dump_enabled_p ())
3196 	    report_vect_op (MSG_NOTE, def_stmt,
3197 			    "detected reduction: need to swap operands: ");
3198 
3199 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3200 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3201         }
3202       else
3203         {
3204           if (dump_enabled_p ())
3205             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3206         }
3207 
3208       return def_stmt_info;
3209     }
3210 
3211   /* Try to find SLP reduction chain.  */
3212   if (! nested_in_vect_loop
3213       && code != COND_EXPR
3214       && orig_code != MINUS_EXPR
3215       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3216     {
3217       if (dump_enabled_p ())
3218         report_vect_op (MSG_NOTE, def_stmt,
3219 			"reduction: detected reduction chain: ");
3220 
3221       return def_stmt_info;
3222     }
3223 
3224   /* Look for the expression computing loop_arg from loop PHI result.  */
3225   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3226     return def_stmt_info;
3227 
3228   if (dump_enabled_p ())
3229     {
3230       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3231 		      "reduction: unknown pattern: ");
3232     }
3233 
3234   return NULL;
3235 }
3236 
3237 /* Wrapper around vect_is_simple_reduction, which will modify code
3238    in-place if it enables detection of more reductions.  The arguments
3239    are the same as for that function.  */
3240 
3241 stmt_vec_info
3242 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3243 			     bool *double_reduc,
3244 			     bool need_wrapping_integral_overflow)
3245 {
3246   enum vect_reduction_type v_reduc_type;
3247   stmt_vec_info def_info
3248     = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3249 				need_wrapping_integral_overflow,
3250 				&v_reduc_type);
3251   if (def_info)
3252     {
3253       STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3254       STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3255       STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3256       STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3257     }
3258   return def_info;
3259 }
3260 
3261 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3262 int
3263 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3264                              int *peel_iters_epilogue,
3265                              stmt_vector_for_cost *scalar_cost_vec,
3266 			     stmt_vector_for_cost *prologue_cost_vec,
3267 			     stmt_vector_for_cost *epilogue_cost_vec)
3268 {
3269   int retval = 0;
3270   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3271 
3272   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3273     {
3274       *peel_iters_epilogue = assumed_vf / 2;
3275       if (dump_enabled_p ())
3276         dump_printf_loc (MSG_NOTE, vect_location,
3277 			 "cost model: epilogue peel iters set to vf/2 "
3278 			 "because loop iterations are unknown .\n");
3279 
3280       /* If peeled iterations are known but the number of scalar loop
3281          iterations is unknown, count a taken branch per peeled loop.  */
3282       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3283 				 NULL, 0, vect_prologue);
3284       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3285 				  NULL, 0, vect_epilogue);
3286     }
3287   else
3288     {
3289       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3290       peel_iters_prologue = niters < peel_iters_prologue ?
3291                             niters : peel_iters_prologue;
3292       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3293       /* If we need to peel for gaps but the computed epilogue peel count
3294 	 is zero, we have to peel VF iterations.  */
3295       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3296 	*peel_iters_epilogue = assumed_vf;
3297     }
3298 
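  /* For instance (numbers purely illustrative): with NITERS = 100,
     PEEL_ITERS_PROLOGUE = 3 and ASSUMED_VF = 8 the code above sets
     *PEEL_ITERS_EPILOGUE = (100 - 3) % 8 = 1, and the loops below then
     scale the per-statement scalar costs by 3 for the prologue and by 1
     for the epilogue.  */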
3299   stmt_info_for_cost *si;
3300   int j;
3301   if (peel_iters_prologue)
3302     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3303       retval += record_stmt_cost (prologue_cost_vec,
3304 				  si->count * peel_iters_prologue,
3305 				  si->kind, si->stmt_info, si->misalign,
3306 				  vect_prologue);
3307   if (*peel_iters_epilogue)
3308     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309       retval += record_stmt_cost (epilogue_cost_vec,
3310 				  si->count * *peel_iters_epilogue,
3311 				  si->kind, si->stmt_info, si->misalign,
3312 				  vect_epilogue);
3313 
3314   return retval;
3315 }
3316 
3317 /* Function vect_estimate_min_profitable_iters
3318 
3319    Return the number of iterations required for the vector version of the
3320    loop to be profitable relative to the cost of the scalar version of the
3321    loop.
3322 
3323    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3324    of iterations for vectorization.  -1 value means loop vectorization
3325    is not profitable.  This returned value may be used for dynamic
3326    profitability check.
3327 
3328    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3329    for static check against estimated number of iterations.  */
3330 
3331 static void
3332 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3333 				    int *ret_min_profitable_niters,
3334 				    int *ret_min_profitable_estimate)
3335 {
3336   int min_profitable_iters;
3337   int min_profitable_estimate;
3338   int peel_iters_prologue;
3339   int peel_iters_epilogue;
3340   unsigned vec_inside_cost = 0;
3341   int vec_outside_cost = 0;
3342   unsigned vec_prologue_cost = 0;
3343   unsigned vec_epilogue_cost = 0;
3344   int scalar_single_iter_cost = 0;
3345   int scalar_outside_cost = 0;
3346   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3347   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3348   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3349 
3350   /* Cost model disabled.  */
3351   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3352     {
3353       if (dump_enabled_p ())
3354 	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3355       *ret_min_profitable_niters = 0;
3356       *ret_min_profitable_estimate = 0;
3357       return;
3358     }
3359 
3360   /* Requires loop versioning tests to handle misalignment.  */
3361   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3362     {
3363       /*  FIXME: Make cost depend on complexity of individual check.  */
3364       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3365       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3366 			    vect_prologue);
3367       if (dump_enabled_p ())
3368 	dump_printf (MSG_NOTE,
3369 		     "cost model: Adding cost of checks for loop "
3370 		     "versioning to treat misalignment.\n");
3371     }
3372 
3373   /* Requires loop versioning with alias checks.  */
3374   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3375     {
3376       /*  FIXME: Make cost depend on complexity of individual check.  */
3377       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3378       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3379 			    vect_prologue);
3380       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3381       if (len)
3382 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3383 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3384 			      NULL, 0, vect_prologue);
3385       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3386       if (len)
3387 	{
3388 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3389 	  unsigned int nstmts = len * 2 - 1;
3390 	  /* +1 for each bias that needs adding.  */
3391 	  for (unsigned int i = 0; i < len; ++i)
3392 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3393 	      nstmts += 1;
3394 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3395 				NULL, 0, vect_prologue);
3396 	}
3397       if (dump_enabled_p ())
3398 	dump_printf (MSG_NOTE,
3399 		     "cost model: Adding cost of checks for loop "
3400 		     "versioning aliasing.\n");
3401     }
3402 
3403   /* Requires loop versioning with niter checks.  */
3404   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3405     {
3406       /*  FIXME: Make cost depend on complexity of individual check.  */
3407       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3408 			    vect_prologue);
3409       if (dump_enabled_p ())
3410 	dump_printf (MSG_NOTE,
3411 		     "cost model: Adding cost of checks for loop "
3412 		     "versioning niters.\n");
3413     }
3414 
3415   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3416     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3417 			  vect_prologue);
3418 
3419   /* Count statements in scalar loop.  Using this as scalar cost for a single
3420      iteration for now.
3421 
3422      TODO: Add outer loop support.
3423 
3424      TODO: Consider assigning different costs to different scalar
3425      statements.  */
3426 
3427   scalar_single_iter_cost
3428     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3429 
3430   /* Add additional cost for the peeled instructions in prologue and epilogue
3431      loop.  (For fully-masked loops there will be no peeling.)
3432 
3433      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3434      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3435 
3436      TODO: Build an expression that represents peel_iters for prologue and
3437      epilogue to be used in a run-time test.  */
3438 
3439   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3440     {
3441       peel_iters_prologue = 0;
3442       peel_iters_epilogue = 0;
3443 
3444       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3445 	{
3446 	  /* We need to peel exactly one iteration.  */
3447 	  peel_iters_epilogue += 1;
3448 	  stmt_info_for_cost *si;
3449 	  int j;
3450 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3451 			    j, si)
3452 	    (void) add_stmt_cost (target_cost_data, si->count,
3453 				  si->kind, si->stmt_info, si->misalign,
3454 				  vect_epilogue);
3455 	}
3456     }
3457   else if (npeel < 0)
3458     {
3459       peel_iters_prologue = assumed_vf / 2;
3460       if (dump_enabled_p ())
3461 	dump_printf (MSG_NOTE, "cost model: "
3462 		     "prologue peel iters set to vf/2.\n");
3463 
3464       /* If peeling for alignment is unknown, the loop bound of the main
3465          loop becomes unknown.  */
3466       peel_iters_epilogue = assumed_vf / 2;
3467       if (dump_enabled_p ())
3468 	dump_printf (MSG_NOTE, "cost model: "
3469 		     "epilogue peel iters set to vf/2 because "
3470 		     "peeling for alignment is unknown.\n");
3471 
3472       /* If peeled iterations are unknown, count a taken branch and a not taken
3473          branch per peeled loop. Even if scalar loop iterations are known,
3474          vector iterations are not known since peeled prologue iterations are
3475          not known. Hence guards remain the same.  */
3476       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3477 			    NULL, 0, vect_prologue);
3478       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3479 			    NULL, 0, vect_prologue);
3480       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3481 			    NULL, 0, vect_epilogue);
3482       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3483 			    NULL, 0, vect_epilogue);
3484       stmt_info_for_cost *si;
3485       int j;
3486       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3487 	{
3488 	  (void) add_stmt_cost (target_cost_data,
3489 				si->count * peel_iters_prologue,
3490 				si->kind, si->stmt_info, si->misalign,
3491 				vect_prologue);
3492 	  (void) add_stmt_cost (target_cost_data,
3493 				si->count * peel_iters_epilogue,
3494 				si->kind, si->stmt_info, si->misalign,
3495 				vect_epilogue);
3496 	}
3497     }
3498   else
3499     {
3500       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3501       stmt_info_for_cost *si;
3502       int j;
3503       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3504 
3505       prologue_cost_vec.create (2);
3506       epilogue_cost_vec.create (2);
3507       peel_iters_prologue = npeel;
3508 
3509       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3510 					  &peel_iters_epilogue,
3511 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3512 					    (loop_vinfo),
3513 					  &prologue_cost_vec,
3514 					  &epilogue_cost_vec);
3515 
3516       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3517 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3518 			      si->misalign, vect_prologue);
3519 
3520       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3521 	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3522 			      si->misalign, vect_epilogue);
3523 
3524       prologue_cost_vec.release ();
3525       epilogue_cost_vec.release ();
3526     }
3527 
3528   /* FORNOW: The scalar outside cost is incremented in one of the
3529      following ways:
3530 
3531      1. The vectorizer checks for alignment and aliasing and generates
3532      a condition that allows dynamic vectorization.  A cost model
3533      check is ANDED with the versioning condition.  Hence scalar code
3534      path now has the added cost of the versioning check.
3535 
3536        if (cost > th & versioning_check)
3537          jmp to vector code
3538 
3539      Hence the run-time scalar cost is incremented by a not-taken branch cost.
3540 
3541      2. The vectorizer then checks if a prologue is required.  If the
3542      cost model check was not done before during versioning, it has to
3543      be done before the prologue check.
3544 
3545        if (cost <= th)
3546          prologue = scalar_iters
3547        if (prologue == 0)
3548          jmp to vector code
3549        else
3550          execute prologue
3551        if (prologue == num_iters)
3552 	 go to exit
3553 
3554      Hence the run-time scalar cost is incremented by a taken branch,
3555      plus a not-taken branch, plus a taken branch cost.
3556 
3557      3. The vectorizer then checks if an epilogue is required.  If the
3558      cost model check was not done before during prologue check, it
3559      has to be done with the epilogue check.
3560 
3561        if (prologue == 0)
3562          jmp to vector code
3563        else
3564          execute prologue
3565        if (prologue == num_iters)
3566 	 go to exit
3567        vector code:
3568          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3569            jmp to epilogue
3570 
3571      Hence the run-time scalar cost should be incremented by 2 taken
3572      branches.
3573 
3574      TODO: The back end may reorder the BBs differently and reverse
3575      conditions/branch directions.  Change the estimates below to
3576      something more reasonable.  */
3577 
3578   /* If the number of iterations is known and we do not do versioning, we can
3579      decide whether to vectorize at compile time.  Hence the scalar version
3580      does not carry cost model guard costs.  */
3581   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3582       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3583     {
3584       /* Cost model check occurs at versioning.  */
3585       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3586 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3587       else
3588 	{
3589 	  /* Cost model check occurs at prologue generation.  */
3590 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3591 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3592 	      + vect_get_stmt_cost (cond_branch_not_taken);
3593 	  /* Cost model check occurs at epilogue generation.  */
3594 	  else
3595 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3596 	}
3597     }
3598 
3599   /* Complete the target-specific cost calculations.  */
3600   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3601 	       &vec_inside_cost, &vec_epilogue_cost);
3602 
3603   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3604 
3605   if (dump_enabled_p ())
3606     {
3607       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3608       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3609                    vec_inside_cost);
3610       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3611                    vec_prologue_cost);
3612       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3613                    vec_epilogue_cost);
3614       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3615                    scalar_single_iter_cost);
3616       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3617                    scalar_outside_cost);
3618       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3619                    vec_outside_cost);
3620       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3621                    peel_iters_prologue);
3622       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3623                    peel_iters_epilogue);
3624     }
3625 
3626   /* Calculate number of iterations required to make the vector version
3627      profitable, relative to the loop bodies only.  The following condition
3628      must hold true:
3629      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3630      where
3631      SIC = scalar iteration cost, VIC = vector iteration cost,
3632      VOC = vector outside cost, VF = vectorization factor,
3633      NPEEL = prologue iterations + epilogue iterations,
3634      SOC = scalar outside cost for run time cost model check.  */
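  /* As a purely illustrative example: with SIC = 4, VIC = 6, VOC = 20,
     SOC = 0, VF = 4 and NPEEL = 0 the condition above becomes
     4 * niters > 6 * (niters / 4) + 20, i.e. 2.5 * niters > 20, so the
     vector loop starts to pay off for niters above 8 (for these numbers
     the non-fully-masked code below arrives at min_profitable_iters = 9).  */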
3635 
3636   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3637 			  - vec_inside_cost);
3638   if (saving_per_viter <= 0)
3639     {
3640       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3641 	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3642 		    "vectorization did not happen for a simd loop");
3643 
3644       if (dump_enabled_p ())
3645         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 			 "cost model: the vector iteration cost = %d "
3647 			 "divided by the scalar iteration cost = %d "
3648 			 "is greater or equal to the vectorization factor = %d"
3649                          ".\n",
3650 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3651       *ret_min_profitable_niters = -1;
3652       *ret_min_profitable_estimate = -1;
3653       return;
3654     }
3655 
3656   /* ??? The "if" arm is written to handle all cases; see below for what
3657      we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3658   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3659     {
3660       /* Rewriting the condition above in terms of the number of
3661 	 vector iterations (vniters) rather than the number of
3662 	 scalar iterations (niters) gives:
3663 
3664 	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3665 
3666 	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3667 
3668 	 For integer N, X and Y when X > 0:
3669 
3670 	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
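      /* E.g. for X = 3 and Y = 7, N * 3 > 7 holds exactly for
	 N >= 7/3 + 1 = 3, which is how min_vec_niters is computed below.  */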
3671       int outside_overhead = (vec_outside_cost
3672 			      - scalar_single_iter_cost * peel_iters_prologue
3673 			      - scalar_single_iter_cost * peel_iters_epilogue
3674 			      - scalar_outside_cost);
3675       /* We're only interested in cases that require at least one
3676 	 vector iteration.  */
3677       int min_vec_niters = 1;
3678       if (outside_overhead > 0)
3679 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3680 
3681       if (dump_enabled_p ())
3682 	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3683 		     min_vec_niters);
3684 
3685       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3686 	{
3687 	  /* Now that we know the minimum number of vector iterations,
3688 	     find the minimum niters for which the scalar cost is larger:
3689 
3690 	     SIC * niters > VIC * vniters + VOC - SOC
3691 
3692 	     We know that the minimum niters is no more than
3693 	     vniters * VF + NPEEL, but it might be (and often is) less
3694 	     than that if a partial vector iteration is cheaper than the
3695 	     equivalent scalar code.  */
3696 	  int threshold = (vec_inside_cost * min_vec_niters
3697 			   + vec_outside_cost
3698 			   - scalar_outside_cost);
3699 	  if (threshold <= 0)
3700 	    min_profitable_iters = 1;
3701 	  else
3702 	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3703 	}
3704       else
3705 	/* Convert the number of vector iterations into a number of
3706 	   scalar iterations.  */
3707 	min_profitable_iters = (min_vec_niters * assumed_vf
3708 				+ peel_iters_prologue
3709 				+ peel_iters_epilogue);
3710     }
3711   else
3712     {
3713       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3714 			      * assumed_vf
3715 			      - vec_inside_cost * peel_iters_prologue
3716 			      - vec_inside_cost * peel_iters_epilogue);
3717       if (min_profitable_iters <= 0)
3718         min_profitable_iters = 0;
3719       else
3720 	{
3721 	  min_profitable_iters /= saving_per_viter;
3722 
3723 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3724 	      <= (((int) vec_inside_cost * min_profitable_iters)
3725 		  + (((int) vec_outside_cost - scalar_outside_cost)
3726 		     * assumed_vf)))
3727 	    min_profitable_iters++;
3728 	}
3729     }
3730 
3731   if (dump_enabled_p ())
3732     dump_printf (MSG_NOTE,
3733 		 "  Calculated minimum iters for profitability: %d\n",
3734 		 min_profitable_iters);
3735 
3736   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3737       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3738     /* We want the vectorized loop to execute at least once.  */
3739     min_profitable_iters = assumed_vf + peel_iters_prologue;
3740 
3741   if (dump_enabled_p ())
3742     dump_printf_loc (MSG_NOTE, vect_location,
3743                      "  Runtime profitability threshold = %d\n",
3744                      min_profitable_iters);
3745 
3746   *ret_min_profitable_niters = min_profitable_iters;
3747 
3748   /* Calculate number of iterations required to make the vector version
3749      profitable, relative to the loop bodies only.
3750 
3751      The non-vectorized variant costs SIC * niters and must win over the
3752      vector variant on the expected loop trip count, i.e.:
3753      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3754 
3755   if (vec_outside_cost <= 0)
3756     min_profitable_estimate = 0;
3757   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3758     {
3759       /* This is a repeat of the code above, but with + SOC rather
3760 	 than - SOC.  */
3761       int outside_overhead = (vec_outside_cost
3762 			      - scalar_single_iter_cost * peel_iters_prologue
3763 			      - scalar_single_iter_cost * peel_iters_epilogue
3764 			      + scalar_outside_cost);
3765       int min_vec_niters = 1;
3766       if (outside_overhead > 0)
3767 	min_vec_niters = outside_overhead / saving_per_viter + 1;
3768 
3769       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3770 	{
3771 	  int threshold = (vec_inside_cost * min_vec_niters
3772 			   + vec_outside_cost
3773 			   + scalar_outside_cost);
3774 	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3775 	}
3776       else
3777 	min_profitable_estimate = (min_vec_niters * assumed_vf
3778 				   + peel_iters_prologue
3779 				   + peel_iters_epilogue);
3780     }
3781   else
3782     {
3783       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3784 				 * assumed_vf
3785 				 - vec_inside_cost * peel_iters_prologue
3786 				 - vec_inside_cost * peel_iters_epilogue)
3787 				 / ((scalar_single_iter_cost * assumed_vf)
3788 				   - vec_inside_cost);
3789     }
3790   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3791   if (dump_enabled_p ())
3792     dump_printf_loc (MSG_NOTE, vect_location,
3793 		     "  Static estimate profitability threshold = %d\n",
3794 		     min_profitable_estimate);
3795 
3796   *ret_min_profitable_estimate = min_profitable_estimate;
3797 }
3798 
3799 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3800    vector elements (not bits) for a vector with NELT elements.  */
3801 static void
3802 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3803 			      vec_perm_builder *sel)
3804 {
3805   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3806      by vec_perm_indices.  */
3807   sel->new_vector (nelt, 1, 3);
3808   for (unsigned int i = 0; i < 3; i++)
3809     sel->quick_push (i + offset);
3810 }
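/* For example (illustrative only), OFFSET = 2 with NELT = 8 encodes the
   stepped series 2, 3, 4, ...; interpreted by vec_perm_indices as a
   two-input permutation this selects elements 2..7 of the first vector
   followed by elements 0..1 of the second, i.e. a whole-vector shift by
   two elements.  */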
3811 
3812 /* Checks whether the target supports whole-vector shifts for vectors of mode
3813    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3814    it supports vec_perm_const with masks for all necessary shift amounts.  */
3815 static bool
3816 have_whole_vector_shift (machine_mode mode)
3817 {
3818   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3819     return true;
3820 
3821   /* Variable-length vectors should be handled via the optab.  */
3822   unsigned int nelt;
3823   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3824     return false;
3825 
3826   vec_perm_builder sel;
3827   vec_perm_indices indices;
3828   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3829     {
3830       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3831       indices.new_vector (sel, 2, nelt);
3832       if (!can_vec_perm_const_p (mode, indices, false))
3833 	return false;
3834     }
3835   return true;
3836 }
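/* For an eight-element vector, for instance, the loop above asks for
   shift permutations by 4, 2 and 1 elements - the same halving sequence
   the reduction epilogue below is costed with
   (exact_log2 (nelements) * 2 statements).  */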
3837 
3838 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3839    functions. Design better to avoid maintenance issues.  */
3840 
3841 /* Function vect_model_reduction_cost.
3842 
3843    Models cost for a reduction operation, including the vector ops
3844    generated within the strip-mine loop, the initial definition before
3845    the loop, and the epilogue code that must be generated.  */
3846 
3847 static void
3848 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3849 			   int ncopies, stmt_vector_for_cost *cost_vec)
3850 {
3851   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3852   enum tree_code code;
3853   optab optab;
3854   tree vectype;
3855   machine_mode mode;
3856   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3857   struct loop *loop = NULL;
3858 
3859   if (loop_vinfo)
3860     loop = LOOP_VINFO_LOOP (loop_vinfo);
3861 
3862   /* Condition reductions generate two reductions in the loop.  */
3863   vect_reduction_type reduction_type
3864     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3865   if (reduction_type == COND_REDUCTION)
3866     ncopies *= 2;
3867 
3868   vectype = STMT_VINFO_VECTYPE (stmt_info);
3869   mode = TYPE_MODE (vectype);
3870   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3871 
3872   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3873 
3874   if (reduction_type == EXTRACT_LAST_REDUCTION
3875       || reduction_type == FOLD_LEFT_REDUCTION)
3876     {
3877       /* No extra instructions needed in the prologue.  */
3878       prologue_cost = 0;
3879 
3880       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3881 	/* Count one reduction-like operation per vector.  */
3882 	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3883 					stmt_info, 0, vect_body);
3884       else
3885 	{
3886 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3887 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3888 	  inside_cost = record_stmt_cost (cost_vec, nelements,
3889 					  vec_to_scalar, stmt_info, 0,
3890 					  vect_body);
3891 	  inside_cost += record_stmt_cost (cost_vec, nelements,
3892 					   scalar_stmt, stmt_info, 0,
3893 					   vect_body);
3894 	}
3895     }
3896   else
3897     {
3898       /* Add in cost for initial definition.
3899 	 For cond reduction we have four vectors: initial index, step,
3900 	 initial result of the data reduction, initial value of the index
3901 	 reduction.  */
3902       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3903       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3904 					 scalar_to_vec, stmt_info, 0,
3905 					 vect_prologue);
3906 
3907       /* Cost of reduction op inside loop.  */
3908       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3909 				      stmt_info, 0, vect_body);
3910     }
3911 
3912   /* Determine cost of epilogue code.
3913 
3914      We have a reduction operator that will reduce the vector in one statement.
3915      Also requires scalar extract.  */
3916 
3917   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3918     {
3919       if (reduc_fn != IFN_LAST)
3920 	{
3921 	  if (reduction_type == COND_REDUCTION)
3922 	    {
3923 	      /* An EQ stmt and a COND_EXPR stmt.  */
3924 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3925 						 vector_stmt, stmt_info, 0,
3926 						 vect_epilogue);
3927 	      /* Reduction of the max index and a reduction of the found
3928 		 values.  */
3929 	      epilogue_cost += record_stmt_cost (cost_vec, 2,
3930 						 vec_to_scalar, stmt_info, 0,
3931 						 vect_epilogue);
3932 	      /* A broadcast of the max value.  */
3933 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3934 						 scalar_to_vec, stmt_info, 0,
3935 						 vect_epilogue);
3936 	    }
3937 	  else
3938 	    {
3939 	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3940 						 stmt_info, 0, vect_epilogue);
3941 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3942 						 vec_to_scalar, stmt_info, 0,
3943 						 vect_epilogue);
3944 	    }
3945 	}
3946       else if (reduction_type == COND_REDUCTION)
3947 	{
3948 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3949 	  /* Extraction of scalar elements.  */
3950 	  epilogue_cost += record_stmt_cost (cost_vec,
3951 					     2 * estimated_nunits,
3952 					     vec_to_scalar, stmt_info, 0,
3953 					     vect_epilogue);
3954 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3955 	  epilogue_cost += record_stmt_cost (cost_vec,
3956 					     2 * estimated_nunits - 3,
3957 					     scalar_stmt, stmt_info, 0,
3958 					     vect_epilogue);
3959 	}
3960       else if (reduction_type == EXTRACT_LAST_REDUCTION
3961 	       || reduction_type == FOLD_LEFT_REDUCTION)
3962 	/* No extra instructions are needed in the epilogue.  */
3963 	;
3964       else
3965 	{
3966 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3967 	  tree bitsize =
3968 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3969 	  int element_bitsize = tree_to_uhwi (bitsize);
3970 	  int nelements = vec_size_in_bits / element_bitsize;
3971 
3972 	  if (code == COND_EXPR)
3973 	    code = MAX_EXPR;
3974 
3975 	  optab = optab_for_tree_code (code, vectype, optab_default);
3976 
3977 	  /* We have a whole vector shift available.  */
3978 	  if (optab != unknown_optab
3979 	      && VECTOR_MODE_P (mode)
3980 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3981 	      && have_whole_vector_shift (mode))
3982 	    {
3983 	      /* Final reduction via vector shifts and the reduction operator.
3984 		 Also requires scalar extract.  */
3985 	      epilogue_cost += record_stmt_cost (cost_vec,
3986 						 exact_log2 (nelements) * 2,
3987 						 vector_stmt, stmt_info, 0,
3988 						 vect_epilogue);
3989 	      epilogue_cost += record_stmt_cost (cost_vec, 1,
3990 						 vec_to_scalar, stmt_info, 0,
3991 						 vect_epilogue);
3992 	    }
3993 	  else
3994 	    /* Use extracts and reduction op for final reduction.  For N
3995 	       elements, we have N extracts and N-1 reduction ops.  */
3996 	    epilogue_cost += record_stmt_cost (cost_vec,
3997 					       nelements + nelements - 1,
3998 					       vector_stmt, stmt_info, 0,
3999 					       vect_epilogue);
4000 	}
4001     }
4002 
4003   if (dump_enabled_p ())
4004     dump_printf (MSG_NOTE,
4005                  "vect_model_reduction_cost: inside_cost = %d, "
4006                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4007                  prologue_cost, epilogue_cost);
4008 }
4009 
4010 
4011 /* Function vect_model_induction_cost.
4012 
4013    Models cost for induction operations.  */
4014 
4015 static void
4016 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4017 			   stmt_vector_for_cost *cost_vec)
4018 {
4019   unsigned inside_cost, prologue_cost;
4020 
4021   if (PURE_SLP_STMT (stmt_info))
4022     return;
4023 
4024   /* loop cost for vec_loop.  */
4025   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4026 				  stmt_info, 0, vect_body);
4027 
4028   /* prologue cost for vec_init and vec_step.  */
4029   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4030 				    stmt_info, 0, vect_prologue);
4031 
4032   if (dump_enabled_p ())
4033     dump_printf_loc (MSG_NOTE, vect_location,
4034                      "vect_model_induction_cost: inside_cost = %d, "
4035                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4036 }
4037 
4038 
4039 
4040 /* Function get_initial_def_for_reduction
4041 
4042    Input:
4043    STMT_VINFO - a stmt that performs a reduction operation in the loop.
4044    INIT_VAL - the initial value of the reduction variable
4045 
4046    Output:
4047    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4048         of the reduction (used for adjusting the epilog - see below).
4049    Return a vector variable, initialized according to the operation that
4050 	STMT_VINFO performs. This vector will be used as the initial value
4051 	of the vector of partial results.
4052 
4053    Option1 (adjust in epilog): Initialize the vector as follows:
4054      add/bit or/xor:    [0,0,...,0,0]
4055      mult/bit and:      [1,1,...,1,1]
4056      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4057    and when necessary (e.g. add/mult case) let the caller know
4058    that it needs to adjust the result by init_val.
4059 
4060    Option2: Initialize the vector as follows:
4061      add/bit or/xor:    [init_val,0,0,...,0]
4062      mult/bit and:      [init_val,1,1,...,1]
4063      min/max/cond_expr: [init_val,init_val,...,init_val]
4064    and no adjustments are needed.
4065 
4066    For example, for the following code:
4067 
4068    s = init_val;
4069    for (i=0;i<n;i++)
4070      s = s + a[i];
4071 
4072    STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4073    For a vector of 4 units, we want to return either [0,0,0,init_val],
4074    or [0,0,0,0] and let the caller know that it needs to adjust
4075    the result at the end by 'init_val'.
4076 
4077    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4078    is not NULL, because its initialization vector is simpler (the same
4079    element in all entries), and Option2 otherwise.
4080 
4081    A cost model should help decide between these two schemes.  */
4082 
4083 tree
4084 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4085                                tree *adjustment_def)
4086 {
4087   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4088   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4089   tree scalar_type = TREE_TYPE (init_val);
4090   tree vectype = get_vectype_for_scalar_type (scalar_type);
4091   enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4092   tree def_for_init;
4093   tree init_def;
4094   REAL_VALUE_TYPE real_init_val = dconst0;
4095   int int_init_val = 0;
4096   gimple_seq stmts = NULL;
4097 
4098   gcc_assert (vectype);
4099 
4100   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4101 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4102 
4103   gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4104 	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4105 
4106   vect_reduction_type reduction_type
4107     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4108 
4109   switch (code)
4110     {
4111     case WIDEN_SUM_EXPR:
4112     case DOT_PROD_EXPR:
4113     case SAD_EXPR:
4114     case PLUS_EXPR:
4115     case MINUS_EXPR:
4116     case BIT_IOR_EXPR:
4117     case BIT_XOR_EXPR:
4118     case MULT_EXPR:
4119     case BIT_AND_EXPR:
4120       {
4121         /* ADJUSTMENT_DEF is NULL when called from
4122            vect_create_epilog_for_reduction to vectorize double reduction.  */
4123         if (adjustment_def)
4124 	  *adjustment_def = init_val;
4125 
4126         if (code == MULT_EXPR)
4127           {
4128             real_init_val = dconst1;
4129             int_init_val = 1;
4130           }
4131 
4132         if (code == BIT_AND_EXPR)
4133           int_init_val = -1;
4134 
4135         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4136           def_for_init = build_real (scalar_type, real_init_val);
4137         else
4138           def_for_init = build_int_cst (scalar_type, int_init_val);
4139 
4140 	if (adjustment_def)
4141 	  /* Option1: the first element is '0' or '1' as well.  */
4142 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4143 						   def_for_init);
4144 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4145 	  {
4146 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4147 	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4148 						     def_for_init);
4149 	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4150 				     vectype, init_def, init_val);
4151 	  }
4152 	else
4153 	  {
4154 	    /* Option2: the first element is INIT_VAL.  */
4155 	    tree_vector_builder elts (vectype, 1, 2);
4156 	    elts.quick_push (init_val);
4157 	    elts.quick_push (def_for_init);
4158 	    init_def = gimple_build_vector (&stmts, &elts);
4159 	  }
4160       }
4161       break;
4162 
4163     case MIN_EXPR:
4164     case MAX_EXPR:
4165     case COND_EXPR:
4166       {
4167 	if (adjustment_def)
4168           {
4169 	    *adjustment_def = NULL_TREE;
4170 	    if (reduction_type != COND_REDUCTION
4171 		&& reduction_type != EXTRACT_LAST_REDUCTION)
4172 	      {
4173 		init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4174 		break;
4175 	      }
4176 	  }
4177 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4178 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4179       }
4180       break;
4181 
4182     default:
4183       gcc_unreachable ();
4184     }
4185 
4186   if (stmts)
4187     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4188   return init_def;
4189 }
4190 
4191 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4192    NUMBER_OF_VECTORS is the number of vector defs to create.
4193    If NEUTRAL_OP is nonnull, introducing extra elements of that
4194    value will not change the result.  */
4195 
4196 static void
4197 get_initial_defs_for_reduction (slp_tree slp_node,
4198 				vec<tree> *vec_oprnds,
4199 				unsigned int number_of_vectors,
4200 				bool reduc_chain, tree neutral_op)
4201 {
4202   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4203   stmt_vec_info stmt_vinfo = stmts[0];
4204   unsigned HOST_WIDE_INT nunits;
4205   unsigned j, number_of_places_left_in_vector;
4206   tree vector_type;
4207   unsigned int group_size = stmts.length ();
4208   unsigned int i;
4209   struct loop *loop;
4210 
4211   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4212 
4213   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4214 
4215   loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4216   gcc_assert (loop);
4217   edge pe = loop_preheader_edge (loop);
4218 
4219   gcc_assert (!reduc_chain || neutral_op);
4220 
4221   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4222      created vectors. It is greater than 1 if unrolling is performed.
4223 
4224      For example, we have two scalar operands, s1 and s2 (e.g., group of
4225      strided accesses of size two), while NUNITS is four (i.e., four scalars
4226      of this type can be packed in a vector).  The output vector will contain
4227      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4228      will be 2).
4229 
4230      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4231      vectors containing the operands.
4232 
4233      For example, NUNITS is four as before, and the group size is 8
4234      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4235      {s5, s6, s7, s8}.  */
4236 
4237   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4238     nunits = group_size;
4239 
4240   number_of_places_left_in_vector = nunits;
4241   bool constant_p = true;
4242   tree_vector_builder elts (vector_type, nunits, 1);
4243   elts.quick_grow (nunits);
4244   gimple_seq ctor_seq = NULL;
4245   for (j = 0; j < nunits * number_of_vectors; ++j)
4246     {
4247       tree op;
4248       i = j % group_size;
4249       stmt_vinfo = stmts[i];
4250 
4251       /* Get the def before the loop.  In reduction chain we have only
4252 	 one initial value.  Else we have as many as PHIs in the group.  */
4253       if (reduc_chain)
4254 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4255       else if (((vec_oprnds->length () + 1) * nunits
4256 		- number_of_places_left_in_vector >= group_size)
4257 	       && neutral_op)
4258 	op = neutral_op;
4259       else
4260 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4261 
4262       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4263       number_of_places_left_in_vector--;
4264       elts[nunits - number_of_places_left_in_vector - 1] = op;
4265       if (!CONSTANT_CLASS_P (op))
4266 	constant_p = false;
4267 
4268       if (number_of_places_left_in_vector == 0)
4269 	{
4270 	  tree init;
4271 	  if (constant_p && !neutral_op
4272 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4273 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4274 	    /* Build the vector directly from ELTS.  */
4275 	    init = gimple_build_vector (&ctor_seq, &elts);
4276 	  else if (neutral_op)
4277 	    {
4278 	      /* Build a vector of the neutral value and shift the
4279 		 other elements into place.  */
4280 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4281 						   neutral_op);
4282 	      int k = nunits;
4283 	      while (k > 0 && elts[k - 1] == neutral_op)
4284 		k -= 1;
4285 	      while (k > 0)
4286 		{
4287 		  k -= 1;
4288 		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4289 				       vector_type, init, elts[k]);
4290 		}
4291 	    }
4292 	  else
4293 	    {
4294 	      /* First time round, duplicate ELTS to fill the
4295 		 required number of vectors.  */
4296 	      duplicate_and_interleave (&ctor_seq, vector_type, elts,
4297 					number_of_vectors, *vec_oprnds);
4298 	      break;
4299 	    }
4300 	  vec_oprnds->quick_push (init);
4301 
4302 	  number_of_places_left_in_vector = nunits;
4303 	  elts.new_vector (vector_type, nunits, 1);
4304 	  elts.quick_grow (nunits);
4305 	  constant_p = true;
4306 	}
4307     }
4308   if (ctor_seq != NULL)
4309     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4310 }
4311 
4312 
4313 /* Function vect_create_epilog_for_reduction
4314 
4315    Create code at the loop-epilog to finalize the result of a reduction
4316    computation.
4317 
4318    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4319      reduction statements.
4320    STMT_INFO is the scalar reduction stmt that is being vectorized.
4321    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4322      number of elements that we can fit in a vectype (nunits).  In this case
4323      we have to generate more than one vector stmt - i.e - we need to "unroll"
4324      the vector stmt by a factor VF/nunits.  For more details see documentation
4325      in vectorizable_operation.
4326    REDUC_FN is the internal function for the epilog reduction.
4327    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4328      computation.
4329    REDUC_INDEX is the index of the operand in the right hand side of the
4330      statement that is defined by REDUCTION_PHI.
4331    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4332    SLP_NODE is an SLP node containing a group of reduction statements. The
4333      first one in this group is STMT_INFO.
4334    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4335      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4336      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4337      any value of the IV in the loop.
4338    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4339    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4340      null if this is not an SLP reduction
4341 
4342    This function:
4343    1. Creates the reduction def-use cycles: sets the arguments for
4344       REDUCTION_PHIS:
4345       The loop-entry argument is the vectorized initial-value of the reduction.
4346       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4347       sums.
4348    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4349       by calling the function specified by REDUC_FN if available, or by
4350       other means (whole-vector shifts or a scalar loop).
4351       The function also creates a new phi node at the loop exit to preserve
4352       loop-closed form, as illustrated below.
4353 
4354      The flow at the entry to this function:
4355 
4356         loop:
4357           vec_def = phi <null, null>            # REDUCTION_PHI
4358           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4359           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4360         loop_exit:
4361           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4362           use <s_out0>
4363           use <s_out0>
4364 
4365      The above is transformed by this function into:
4366 
4367         loop:
4368           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4369           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4370           s_loop = scalar_stmt                  # (scalar) STMT_INFO
4371         loop_exit:
4372           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4373           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4374           v_out2 = reduce <v_out1>
4375           s_out3 = extract_field <v_out2, 0>
4376           s_out4 = adjust_result <s_out3>
4377           use <s_out4>
4378           use <s_out4>
4379 */
4380 
4381 static void
4382 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4383 				  stmt_vec_info stmt_info,
4384 				  gimple *reduc_def_stmt,
4385 				  int ncopies, internal_fn reduc_fn,
4386 				  vec<stmt_vec_info> reduction_phis,
4387                                   bool double_reduc,
4388 				  slp_tree slp_node,
4389 				  slp_instance slp_node_instance,
4390 				  tree induc_val, enum tree_code induc_code,
4391 				  tree neutral_op)
4392 {
4393   stmt_vec_info prev_phi_info;
4394   tree vectype;
4395   machine_mode mode;
4396   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4397   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4398   basic_block exit_bb;
4399   tree scalar_dest;
4400   tree scalar_type;
4401   gimple *new_phi = NULL, *phi;
4402   stmt_vec_info phi_info;
4403   gimple_stmt_iterator exit_gsi;
4404   tree vec_dest;
4405   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4406   gimple *epilog_stmt = NULL;
4407   enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4408   gimple *exit_phi;
4409   tree bitsize;
4410   tree adjustment_def = NULL;
4411   tree vec_initial_def = NULL;
4412   tree expr, def, initial_def = NULL;
4413   tree orig_name, scalar_result;
4414   imm_use_iterator imm_iter, phi_imm_iter;
4415   use_operand_p use_p, phi_use_p;
4416   gimple *use_stmt;
4417   stmt_vec_info reduction_phi_info = NULL;
4418   bool nested_in_vect_loop = false;
4419   auto_vec<gimple *> new_phis;
4420   auto_vec<stmt_vec_info> inner_phis;
4421   int j, i;
4422   auto_vec<tree> scalar_results;
4423   unsigned int group_size = 1, k, ratio;
4424   auto_vec<tree> vec_initial_defs;
4425   auto_vec<gimple *> phis;
4426   bool slp_reduc = false;
4427   bool direct_slp_reduc;
4428   tree new_phi_result;
4429   stmt_vec_info inner_phi = NULL;
4430   tree induction_index = NULL_TREE;
4431 
4432   if (slp_node)
4433     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4434 
4435   if (nested_in_vect_loop_p (loop, stmt_info))
4436     {
4437       outer_loop = loop;
4438       loop = loop->inner;
4439       nested_in_vect_loop = true;
4440       gcc_assert (!slp_node);
4441     }
4442 
4443   vectype = STMT_VINFO_VECTYPE (stmt_info);
4444   gcc_assert (vectype);
4445   mode = TYPE_MODE (vectype);
4446 
4447   /* 1. Create the reduction def-use cycle:
4448      Set the arguments of REDUCTION_PHIS, i.e., transform
4449 
4450         loop:
4451           vec_def = phi <null, null>            # REDUCTION_PHI
4452           VECT_DEF = vector_stmt                # vectorized form of STMT
4453           ...
4454 
4455      into:
4456 
4457         loop:
4458           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4459           VECT_DEF = vector_stmt                # vectorized form of STMT
4460           ...
4461 
4462      (in case of SLP, do it for all the phis). */
4463 
4464   /* Get the loop-entry arguments.  */
4465   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4466   if (slp_node)
4467     {
4468       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4469       vec_initial_defs.reserve (vec_num);
4470       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4471 				      &vec_initial_defs, vec_num,
4472 				      REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4473 				      neutral_op);
4474     }
4475   else
4476     {
4477       /* Get at the scalar def before the loop, that defines the initial value
4478 	 of the reduction variable.  */
4479       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4480 					   loop_preheader_edge (loop));
4481       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4482 	 and we can't use zero for induc_val, use initial_def.  Similarly
4483 	 for REDUC_MIN and initial_def larger than the base.  */
4484       if (TREE_CODE (initial_def) == INTEGER_CST
4485 	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4486 	      == INTEGER_INDUC_COND_REDUCTION)
4487 	  && !integer_zerop (induc_val)
4488 	  && ((induc_code == MAX_EXPR
4489 	       && tree_int_cst_lt (initial_def, induc_val))
4490 	      || (induc_code == MIN_EXPR
4491 		  && tree_int_cst_lt (induc_val, initial_def))))
4492 	induc_val = initial_def;
4493 
4494       if (double_reduc)
4495 	/* In case of double reduction we only create a vector variable
4496 	   to be put in the reduction phi node.  The actual statement
4497 	   creation is done later in this function.  */
4498 	vec_initial_def = vect_create_destination_var (initial_def, vectype);
4499       else if (nested_in_vect_loop)
4500 	{
4501 	  /* Do not use an adjustment def as that case is not supported
4502 	     correctly if ncopies is not one.  */
4503 	  vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4504 	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4505 							  stmt_info);
4506 	}
4507       else
4508 	vec_initial_def
4509 	  = get_initial_def_for_reduction (stmt_info, initial_def,
4510 					   &adjustment_def);
4511       vec_initial_defs.create (1);
4512       vec_initial_defs.quick_push (vec_initial_def);
4513     }
4514 
4515   /* Set phi nodes arguments.  */
4516   FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4517     {
4518       tree vec_init_def = vec_initial_defs[i];
4519       tree def = vect_defs[i];
4520       for (j = 0; j < ncopies; j++)
4521         {
4522 	  if (j != 0)
4523 	    {
4524 	      phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4525 	      if (nested_in_vect_loop)
4526 		vec_init_def
4527 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4528 	    }
4529 
4530 	  /* Set the loop-entry arg of the reduction-phi.  */
4531 
4532 	  gphi *phi = as_a <gphi *> (phi_info->stmt);
4533 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4534 	      == INTEGER_INDUC_COND_REDUCTION)
4535 	    {
4536 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4537 		 initial values from interfering with the reduction op.  */
4538 	      gcc_assert (ncopies == 1);
4539 	      gcc_assert (i == 0);
4540 
4541 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4542 	      tree induc_val_vec
4543 		= build_vector_from_val (vec_init_def_type, induc_val);
4544 
4545 	      add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4546 			   UNKNOWN_LOCATION);
4547 	    }
4548 	  else
4549 	    add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4550 			 UNKNOWN_LOCATION);
4551 
4552           /* Set the loop-latch arg for the reduction-phi.  */
4553           if (j > 0)
4554 	    def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4555 
4556 	  add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4557 
4558           if (dump_enabled_p ())
4559 	    dump_printf_loc (MSG_NOTE, vect_location,
4560 			     "transform reduction: created def-use cycle: %G%G",
4561 			     phi, SSA_NAME_DEF_STMT (def));
4562         }
4563     }
4564 
4565   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4566      which is updated with the current index of the loop for every match of
4567      the original loop's cond_expr (VEC_STMT).  This results in a vector
4568      containing the last time the condition passed for that vector lane.
4569      The first match will be a 1 to allow 0 to be used for non-matching
4570      indexes.  If there are no matches at all then the vector will be all
4571      zeroes.  */
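  /* As an illustrative sketch (made-up values, not taken from the code
     below): with a 4-lane vector the index IV takes the values {1,2,3,4},
     {5,6,7,8}, {9,10,11,12}, ...  If lane 1 matches in vector iterations
     0 and 2 and lane 3 matches in vector iteration 1, the tracking vector
     evolves as
	{0,0,0,0} -> {0,2,0,0} -> {0,2,0,8} -> {0,10,0,8}
     so each lane ends up holding 1 + the scalar position of its last
     match, and 0 if it never matched.  */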
4572   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4573     {
4574       tree indx_before_incr, indx_after_incr;
4575       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4576 
4577       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4578       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4579 
4580       int scalar_precision
4581 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4582       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4583       tree cr_index_vector_type = build_vector_type
4584 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4585 
4586       /* First we create a simple vector induction variable which starts
4587 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4588 	 vector size (STEP).  */
4589 
4590       /* Create a {1,2,3,...} vector.  */
4591       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4592 
4593       /* Create a vector of the step value.  */
4594       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4595       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4596 
4597       /* Create an induction variable.  */
4598       gimple_stmt_iterator incr_gsi;
4599       bool insert_after;
4600       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4601       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4602 		 insert_after, &indx_before_incr, &indx_after_incr);
4603 
4604       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4605 	 filled with zeros (VEC_ZERO).  */
4606 
4607       /* Create a vector of 0s.  */
4608       tree zero = build_zero_cst (cr_index_scalar_type);
4609       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4610 
4611       /* Create a vector phi node.  */
4612       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4613       new_phi = create_phi_node (new_phi_tree, loop->header);
4614       loop_vinfo->add_stmt (new_phi);
4615       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4616 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4617 
4618       /* Now take the condition from the loop's original cond_expr
4619 	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4620 	 every match uses values from the induction variable
4621 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4622 	 (NEW_PHI_TREE).
4623 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4624 	 the new cond_expr (INDEX_COND_EXPR).  */
4625 
4626       /* Duplicate the condition from vec_stmt.  */
4627       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4628 
4629       /* Create a conditional, where the condition is taken from vec_stmt
4630 	 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4631 	 and the "else" value is the phi (NEW_PHI_TREE).  */
4632       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4633 				     ccompare, indx_before_incr,
4634 				     new_phi_tree);
4635       induction_index = make_ssa_name (cr_index_vector_type);
4636       gimple *index_condition = gimple_build_assign (induction_index,
4637 						     index_cond_expr);
4638       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4639       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4640       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4641 
4642       /* Update the phi with the vec cond.  */
4643       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4644 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4645     }
4646 
4647   /* 2. Create epilog code.
4648         The reduction epilog code operates across the elements of the vector
4649         of partial results computed by the vectorized loop.
4650         The reduction epilog code consists of:
4651 
4652         step 1: compute the scalar result in a vector (v_out2)
4653         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4654         step 3: adjust the scalar result (s_out3) if needed.
4655 
4656         Step 1 can be accomplished using one of the following three schemes:
4657           (scheme 1) using reduc_fn, if available.
4658           (scheme 2) using whole-vector shifts, if available.
4659           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4660                      combined.
4661 
4662           The overall epilog code looks like this:
4663 
4664           s_out0 = phi <s_loop>         # original EXIT_PHI
4665           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4666           v_out2 = reduce <v_out1>              # step 1
4667           s_out3 = extract_field <v_out2, 0>    # step 2
4668           s_out4 = adjust_result <s_out3>       # step 3
4669 
4670           (step 3 is optional, and steps 1 and 2 may be combined).
4671           Lastly, the uses of s_out0 are replaced by s_out4.  */
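  /* As a concrete (hypothetical) example, for a PLUS reduction whose
     V4SI vector of partial sums is VECT_DEF = {3, 1, 4, 2}, all three
     schemes compute s_out3 = 10: scheme 1 via a single reduc_fn call,
     scheme 2 by repeatedly adding shifted halves of the vector and then
     extracting element 0, and scheme 3 by summing the four elements in
     scalar code.  Step 3 would then add back any initial value that was
     folded out of the vector accumulator.  */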
4672 
4673 
4674   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4675          v_out1 = phi <VECT_DEF>
4676          Store them in NEW_PHIS.  */
4677 
4678   exit_bb = single_exit (loop)->dest;
4679   prev_phi_info = NULL;
4680   new_phis.create (vect_defs.length ());
4681   FOR_EACH_VEC_ELT (vect_defs, i, def)
4682     {
4683       for (j = 0; j < ncopies; j++)
4684         {
4685 	  tree new_def = copy_ssa_name (def);
4686           phi = create_phi_node (new_def, exit_bb);
4687 	  stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4688           if (j == 0)
4689             new_phis.quick_push (phi);
4690           else
4691 	    {
4692 	      def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4693 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4694 	    }
4695 
4696           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4697 	  prev_phi_info = phi_info;
4698         }
4699     }
4700 
4701   /* The epilogue is created for the outer-loop, i.e., for the loop being
4702      vectorized.  Create exit phis for the outer loop.  */
4703   if (double_reduc)
4704     {
4705       loop = outer_loop;
4706       exit_bb = single_exit (loop)->dest;
4707       inner_phis.create (vect_defs.length ());
4708       FOR_EACH_VEC_ELT (new_phis, i, phi)
4709 	{
4710 	  stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4711 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
4712 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
4713 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4714 			   PHI_RESULT (phi));
4715 	  prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4716 	  inner_phis.quick_push (phi_info);
4717 	  new_phis[i] = outer_phi;
4718 	  while (STMT_VINFO_RELATED_STMT (phi_info))
4719             {
4720 	      phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4721 	      new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4722 	      outer_phi = create_phi_node (new_result, exit_bb);
4723 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4724 			       PHI_RESULT (phi_info->stmt));
4725 	      stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4726 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4727 	      prev_phi_info = outer_phi_info;
4728 	    }
4729 	}
4730     }
4731 
4732   exit_gsi = gsi_after_labels (exit_bb);
4733 
4734   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4735          (i.e. when reduc_fn is not available) and in the final adjustment
4736 	 code (if needed).  Also get the original scalar reduction variable as
4737          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4738          represents a reduction pattern), the tree-code and scalar-def are
4739          taken from the original stmt that the pattern-stmt (STMT) replaces.
4740          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4741          are taken from STMT.  */
4742 
4743   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4744   if (orig_stmt_info != stmt_info)
4745     {
4746       /* Reduction pattern  */
4747       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4748       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4749     }
4750 
4751   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4752   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4753      partial results are added and not subtracted.  */
4754   if (code == MINUS_EXPR)
4755     code = PLUS_EXPR;
4756 
4757   scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4758   scalar_type = TREE_TYPE (scalar_dest);
4759   scalar_results.create (group_size);
4760   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4761   bitsize = TYPE_SIZE (scalar_type);
4762 
4763   /* In case this is a reduction in an inner-loop while vectorizing an outer
4764      loop - we don't need to extract a single scalar result at the end of the
4765      inner-loop (unless it is double reduction, i.e., the use of reduction is
4766      outside the outer-loop).  The final vector of partial results will be used
4767      in the vectorized outer-loop, or reduced to a scalar result at the end of
4768      the outer-loop.  */
4769   if (nested_in_vect_loop && !double_reduc)
4770     goto vect_finalize_reduction;
4771 
4772   /* SLP reduction without reduction chain, e.g.,
4773      # a1 = phi <a2, a0>
4774      # b1 = phi <b2, b0>
4775      a2 = operation (a1)
4776      b2 = operation (b1)  */
4777   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4778 
4779   /* True if we should implement SLP_REDUC using native reduction operations
4780      instead of scalar operations.  */
4781   direct_slp_reduc = (reduc_fn != IFN_LAST
4782 		      && slp_reduc
4783 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4784 
4785   /* In case of reduction chain, e.g.,
4786      # a1 = phi <a3, a0>
4787      a2 = operation (a1)
4788      a3 = operation (a2),
4789 
4790      we may end up with more than one vector result.  Here we reduce them to
4791      one vector.  */
4792   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4793     {
4794       tree first_vect = PHI_RESULT (new_phis[0]);
4795       gassign *new_vec_stmt = NULL;
4796       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4797       for (k = 1; k < new_phis.length (); k++)
4798         {
4799 	  gimple *next_phi = new_phis[k];
4800           tree second_vect = PHI_RESULT (next_phi);
4801           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4802           new_vec_stmt = gimple_build_assign (tem, code,
4803 					      first_vect, second_vect);
4804           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4805 	  first_vect = tem;
4806         }
4807 
4808       new_phi_result = first_vect;
4809       if (new_vec_stmt)
4810         {
4811           new_phis.truncate (0);
4812           new_phis.safe_push (new_vec_stmt);
4813         }
4814     }
4815   /* Likewise if we couldn't use a single def-use cycle.  */
4816   else if (ncopies > 1)
4817     {
4818       gcc_assert (new_phis.length () == 1);
4819       tree first_vect = PHI_RESULT (new_phis[0]);
4820       gassign *new_vec_stmt = NULL;
4821       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4822       stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4823       for (int k = 1; k < ncopies; ++k)
4824 	{
4825 	  next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4826 	  tree second_vect = PHI_RESULT (next_phi_info->stmt);
4827           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4828           new_vec_stmt = gimple_build_assign (tem, code,
4829 					      first_vect, second_vect);
4830           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4831 	  first_vect = tem;
4832 	}
4833       new_phi_result = first_vect;
4834       new_phis.truncate (0);
4835       new_phis.safe_push (new_vec_stmt);
4836     }
4837   else
4838     new_phi_result = PHI_RESULT (new_phis[0]);
4839 
4840   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4841       && reduc_fn != IFN_LAST)
4842     {
4843       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4844 	 various data values where the condition matched and another vector
4845 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4846 	 need to extract the last matching index (which will be the index with
4847 	 highest value) and use this to index into the data vector.
4848 	 For the case where there were no matches, the data vector will contain
4849 	 all default values and the index vector will be all zeros.  */
4850 
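      /* For example (hypothetical values, with D being the default value
	 carried by lanes that never matched):

	     NEW_PHI_RESULT  = {  D, 55,  D, 44 }
	     INDUCTION_INDEX = {  0,  9,  0,  6 }

	 The maximum index is 9, so lane 1 holds the last match and the
	 reduction result is 55.  If nothing ever matched, the index vector
	 would be all zeros and every lane would hold D.  */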
4851       /* Get various versions of the type of the vector of indexes.  */
4852       tree index_vec_type = TREE_TYPE (induction_index);
4853       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4854       tree index_scalar_type = TREE_TYPE (index_vec_type);
4855       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4856 	(index_vec_type);
4857 
4858       /* Get an unsigned integer version of the type of the data vector.  */
4859       int scalar_precision
4860 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4861       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4862       tree vectype_unsigned = build_vector_type
4863 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4864 
4865       /* First we need to create a vector (ZERO_VEC) of zeros and another
4866 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4867 	 can create using a MAX reduction and then expanding.
4868 	 In the case where the loop never made any matches, the max index will
4869 	 be zero.  */
4870 
4871       /* Vector of {0, 0, 0,...}.  */
4872       tree zero_vec = make_ssa_name (vectype);
4873       tree zero_vec_rhs = build_zero_cst (vectype);
4874       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4875       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4876 
4877       /* Find maximum value from the vector of found indexes.  */
4878       tree max_index = make_ssa_name (index_scalar_type);
4879       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4880 							  1, induction_index);
4881       gimple_call_set_lhs (max_index_stmt, max_index);
4882       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4883 
4884       /* Vector of {max_index, max_index, max_index,...}.  */
4885       tree max_index_vec = make_ssa_name (index_vec_type);
4886       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4887 						      max_index);
4888       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4889 							max_index_vec_rhs);
4890       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4891 
4892       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4893 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4894 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4895 	 otherwise.  Only one value should match, resulting in a vector
4896 	 (VEC_COND) with one data value and the rest zeros.
4897 	 In the case where the loop never made any matches, every index will
4898 	 match, resulting in a vector with all data values (which will all be
4899 	 the default value).  */
4900 
4901       /* Compare the max index vector to the vector of found indexes to find
4902 	 the position of the max value.  */
4903       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4904       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4905 						      induction_index,
4906 						      max_index_vec);
4907       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4908 
4909       /* Use the compare to choose either values from the data vector or
4910 	 zero.  */
4911       tree vec_cond = make_ssa_name (vectype);
4912       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4913 						   vec_compare, new_phi_result,
4914 						   zero_vec);
4915       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4916 
4917       /* Finally we need to extract the data value from the vector (VEC_COND)
4918 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4919 	 reduction, but because this doesn't exist, we can use a MAX reduction
4920 	 instead.  The data value might be signed or a float so we need to cast
4921 	 it first.
4922 	 In the case where the loop never made any matches, the data values are
4923 	 all identical, and so will reduce down correctly.  */
4924 
4925       /* Make the matched data values unsigned.  */
4926       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4927       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4928 				       vec_cond);
4929       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4930 							VIEW_CONVERT_EXPR,
4931 							vec_cond_cast_rhs);
4932       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4933 
4934       /* Reduce down to a scalar value.  */
4935       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4936       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4937 							   1, vec_cond_cast);
4938       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4939       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4940 
4941       /* Convert the reduced value back to the result type and set as the
4942 	 result.  */
4943       gimple_seq stmts = NULL;
4944       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4945 			       data_reduc);
4946       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4947       scalar_results.safe_push (new_temp);
4948     }
4949   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4950 	   && reduc_fn == IFN_LAST)
4951     {
4952       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4953 	 idx_val = induction_index[0];
4954 	 val = data_reduc[0];
4955 	 for (i = 1; i < nelts; ++i)
4956 	   if (induction_index[i] > idx_val)
4957 	     val = data_reduc[i], idx_val = induction_index[i];
4958 	 return val;
4959 	 i.e. the data value whose index is largest wins.  */
4960 
4961       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4962       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4963       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4964       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4965       /* Enforced by vectorizable_reduction, which ensures we have target
4966 	 support before allowing a conditional reduction on variable-length
4967 	 vectors.  */
4968       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4969       tree idx_val = NULL_TREE, val = NULL_TREE;
4970       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4971 	{
4972 	  tree old_idx_val = idx_val;
4973 	  tree old_val = val;
4974 	  idx_val = make_ssa_name (idx_eltype);
4975 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4976 					     build3 (BIT_FIELD_REF, idx_eltype,
4977 						     induction_index,
4978 						     bitsize_int (el_size),
4979 						     bitsize_int (off)));
4980 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4981 	  val = make_ssa_name (data_eltype);
4982 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4983 					     build3 (BIT_FIELD_REF,
4984 						     data_eltype,
4985 						     new_phi_result,
4986 						     bitsize_int (el_size),
4987 						     bitsize_int (off)));
4988 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4989 	  if (off != 0)
4990 	    {
4991 	      tree new_idx_val = idx_val;
4992 	      tree new_val = val;
4993 	      if (off != v_size - el_size)
4994 		{
4995 		  new_idx_val = make_ssa_name (idx_eltype);
4996 		  epilog_stmt = gimple_build_assign (new_idx_val,
4997 						     MAX_EXPR, idx_val,
4998 						     old_idx_val);
4999 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5000 		}
5001 	      new_val = make_ssa_name (data_eltype);
5002 	      epilog_stmt = gimple_build_assign (new_val,
5003 						 COND_EXPR,
5004 						 build2 (GT_EXPR,
5005 							 boolean_type_node,
5006 							 idx_val,
5007 							 old_idx_val),
5008 						 val, old_val);
5009 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5010 	      idx_val = new_idx_val;
5011 	      val = new_val;
5012 	    }
5013 	}
5014       /* Convert the reduced value back to the result type and set as the
5015 	 result.  */
5016       gimple_seq stmts = NULL;
5017       val = gimple_convert (&stmts, scalar_type, val);
5018       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5019       scalar_results.safe_push (val);
5020     }
5021 
5022   /* 2.3 Create the reduction code, using one of the three schemes described
5023          above. In SLP we simply need to extract all the elements from the
5024          vector (without reducing them), so we use scalar shifts.  */
5025   else if (reduc_fn != IFN_LAST && !slp_reduc)
5026     {
5027       tree tmp;
5028       tree vec_elem_type;
5029 
5030       /* Case 1:  Create:
5031          v_out2 = reduc_expr <v_out1>  */
5032 
5033       if (dump_enabled_p ())
5034         dump_printf_loc (MSG_NOTE, vect_location,
5035 			 "Reduce using direct vector reduction.\n");
5036 
5037       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5038       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5039 	{
5040 	  tree tmp_dest
5041 	    = vect_create_destination_var (scalar_dest, vec_elem_type);
5042 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5043 						    new_phi_result);
5044 	  gimple_set_lhs (epilog_stmt, tmp_dest);
5045 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5046 	  gimple_set_lhs (epilog_stmt, new_temp);
5047 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5048 
5049 	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5050 					     new_temp);
5051 	}
5052       else
5053 	{
5054 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5055 						    new_phi_result);
5056 	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
5057 	}
5058 
5059       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5060       gimple_set_lhs (epilog_stmt, new_temp);
5061       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062 
5063       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5064 	   == INTEGER_INDUC_COND_REDUCTION)
5065 	  && !operand_equal_p (initial_def, induc_val, 0))
5066 	{
5067 	  /* Earlier we set the initial value to be a vector of induc_val
5068 	     values.  Check the result and if it is induc_val then replace
5069 	     it with the original initial value, unless induc_val is
5070 	     already the same as initial_def.  */
5071 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5072 				  induc_val);
5073 
5074 	  tmp = make_ssa_name (new_scalar_dest);
5075 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5076 					     initial_def, new_temp);
5077 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5078 	  new_temp = tmp;
5079 	}
5080 
5081       scalar_results.safe_push (new_temp);
5082     }
5083   else if (direct_slp_reduc)
5084     {
5085       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5086 	 with the elements for other SLP statements replaced with the
5087 	 neutral value.  We can then do a normal reduction on each vector.  */
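      /* For instance (a hypothetical example), for a PLUS reduction with
	 REDUC_GROUP_SIZE == 2 and NEW_PHI_RESULT == {a0, b0, a1, b1},
	 the two masked vectors are {a0, 0, a1, 0} and {0, b0, 0, b1},
	 and reducing each of them gives the scalar results a0 + a1
	 and b0 + b1.  */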
5088 
5089       /* Enforced by vectorizable_reduction.  */
5090       gcc_assert (new_phis.length () == 1);
5091       gcc_assert (pow2p_hwi (group_size));
5092 
5093       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5094       vec<stmt_vec_info> orig_phis
5095 	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5096       gimple_seq seq = NULL;
5097 
5098       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5099 	 and the same element size as VECTYPE.  */
5100       tree index = build_index_vector (vectype, 0, 1);
5101       tree index_type = TREE_TYPE (index);
5102       tree index_elt_type = TREE_TYPE (index_type);
5103       tree mask_type = build_same_sized_truth_vector_type (index_type);
5104 
5105       /* Create a vector that, for each element, identifies which of
5106 	 the REDUC_GROUP_SIZE results should use it.  */
5107       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5108       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5109 			    build_vector_from_val (index_type, index_mask));
5110 
5111       /* Get a neutral vector value.  This is simply a splat of the neutral
5112 	 scalar value if we have one, otherwise the initial scalar value
5113 	 is itself a neutral value.  */
5114       tree vector_identity = NULL_TREE;
5115       if (neutral_op)
5116 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5117 							neutral_op);
5118       for (unsigned int i = 0; i < group_size; ++i)
5119 	{
5120 	  /* If there's no universal neutral value, we can use the
5121 	     initial scalar value from the original PHI.  This is used
5122 	     for MIN and MAX reduction, for example.  */
5123 	  if (!neutral_op)
5124 	    {
5125 	      tree scalar_value
5126 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5127 					 loop_preheader_edge (loop));
5128 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5129 							      scalar_value);
5130 	    }
5131 
5132 	  /* Calculate the equivalent of:
5133 
5134 	     sel[j] = (index[j] == i);
5135 
5136 	     which selects the elements of NEW_PHI_RESULT that should
5137 	     be included in the result.  */
5138 	  tree compare_val = build_int_cst (index_elt_type, i);
5139 	  compare_val = build_vector_from_val (index_type, compare_val);
5140 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5141 				   index, compare_val);
5142 
5143 	  /* Calculate the equivalent of:
5144 
5145 	     vec = sel ? new_phi_result : vector_identity;
5146 
5147 	     VEC is now suitable for a full vector reduction.  */
5148 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5149 				   sel, new_phi_result, vector_identity);
5150 
5151 	  /* Do the reduction and convert it to the appropriate type.  */
5152 	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5153 				      TREE_TYPE (vectype), vec);
5154 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5155 	  scalar_results.safe_push (scalar);
5156 	}
5157       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5158     }
5159   else
5160     {
5161       bool reduce_with_shift;
5162       tree vec_temp;
5163 
5164       /* COND reductions all do the final reduction with MAX_EXPR
5165 	 or MIN_EXPR.  */
5166       if (code == COND_EXPR)
5167 	{
5168 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5169 	      == INTEGER_INDUC_COND_REDUCTION)
5170 	    code = induc_code;
5171 	  else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 		   == CONST_COND_REDUCTION)
5173 	    code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5174 	  else
5175 	    code = MAX_EXPR;
5176 	}
5177 
5178       /* See if the target wants to do the final (shift) reduction
5179 	 in a vector mode of smaller size and first reduce upper/lower
5180 	 halves against each other.  */
5181       enum machine_mode mode1 = mode;
5182       tree vectype1 = vectype;
5183       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5184       unsigned sz1 = sz;
5185       if (!slp_reduc
5186 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5187 	sz1 = GET_MODE_SIZE (mode1).to_constant ();
5188 
5189       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5190       reduce_with_shift = have_whole_vector_shift (mode1);
5191       if (!VECTOR_MODE_P (mode1))
5192 	reduce_with_shift = false;
5193       else
5194 	{
5195 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5196 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5197 	    reduce_with_shift = false;
5198 	}
5199 
5200       /* First reduce the vector to the desired vector size we should
5201 	 do the shift reduction on, by combining upper and lower halves.  */
5202       new_temp = new_phi_result;
5203       while (sz > sz1)
5204 	{
5205 	  gcc_assert (!slp_reduc);
5206 	  sz /= 2;
5207 	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5208 
5209 	  /* The target has to make sure we support lowpart/highpart
5210 	     extraction, either via a direct vector extract or by punning
5211 	     through an integer mode.  */
5212 	  tree dst1, dst2;
5213 	  if (convert_optab_handler (vec_extract_optab,
5214 				     TYPE_MODE (TREE_TYPE (new_temp)),
5215 				     TYPE_MODE (vectype1))
5216 	      != CODE_FOR_nothing)
5217 	    {
5218 	      /* Extract sub-vectors directly once vec_extract becomes
5219 		 a conversion optab.  */
5220 	      dst1 = make_ssa_name (vectype1);
5221 	      epilog_stmt
5222 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 					 build3 (BIT_FIELD_REF, vectype1,
5224 						 new_temp, TYPE_SIZE (vectype1),
5225 						 bitsize_int (0)));
5226 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 	      dst2 = make_ssa_name (vectype1);
5228 	      epilog_stmt
5229 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 					 build3 (BIT_FIELD_REF, vectype1,
5231 						 new_temp, TYPE_SIZE (vectype1),
5232 						 bitsize_int (sz * BITS_PER_UNIT)));
5233 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 	    }
5235 	  else
5236 	    {
5237 	      /* Extract via punning to an appropriately sized integer mode
5238 		 vector.  */
5239 	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5240 							    1);
5241 	      tree etype = build_vector_type (eltype, 2);
5242 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5243 						 TYPE_MODE (etype),
5244 						 TYPE_MODE (eltype))
5245 			  != CODE_FOR_nothing);
5246 	      tree tem = make_ssa_name (etype);
5247 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5248 						 build1 (VIEW_CONVERT_EXPR,
5249 							 etype, new_temp));
5250 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 	      new_temp = tem;
5252 	      tem = make_ssa_name (eltype);
5253 	      epilog_stmt
5254 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5255 					 build3 (BIT_FIELD_REF, eltype,
5256 						 new_temp, TYPE_SIZE (eltype),
5257 						 bitsize_int (0)));
5258 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 	      dst1 = make_ssa_name (vectype1);
5260 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5261 						 build1 (VIEW_CONVERT_EXPR,
5262 							 vectype1, tem));
5263 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 	      tem = make_ssa_name (eltype);
5265 	      epilog_stmt
5266 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5267 					 build3 (BIT_FIELD_REF, eltype,
5268 						 new_temp, TYPE_SIZE (eltype),
5269 						 bitsize_int (sz * BITS_PER_UNIT)));
5270 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 	      dst2 = make_ssa_name (vectype1);
5272 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5273 						 build1 (VIEW_CONVERT_EXPR,
5274 							 vectype1, tem));
5275 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5276 	    }
5277 
5278 	  new_temp = make_ssa_name (vectype1);
5279 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5280 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5281 	}
5282 
5283       if (reduce_with_shift && !slp_reduc)
5284 	{
5285 	  int element_bitsize = tree_to_uhwi (bitsize);
5286 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 	     for variable-length vectors and also requires direct target support
5288 	     for loop reductions.  */
5289 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5290 	  int nelements = vec_size_in_bits / element_bitsize;
5291 	  vec_perm_builder sel;
5292 	  vec_perm_indices indices;
5293 
5294           int elt_offset;
5295 
5296           tree zero_vec = build_zero_cst (vectype1);
5297           /* Case 2: Create:
5298              for (offset = nelements/2; offset >= 1; offset/=2)
5299                 {
5300                   Create:  va' = vec_shift <va, offset>
5301                   Create:  va = vop <va, va'>
5302                 }  */
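          /* A worked (hypothetical) example for a PLUS reduction on
             V4SI {1, 2, 3, 4}:
                shift by 2:  {3, 4, 0, 0}    va = {4, 6, 3, 4}
                shift by 1:  {6, 3, 4, 0}    va = {10, 9, 7, 4}
             Only element 0 of the final VA is meaningful; it is extracted
             at bit position 0 below.  */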
5303 
5304           tree rhs;
5305 
5306           if (dump_enabled_p ())
5307             dump_printf_loc (MSG_NOTE, vect_location,
5308 			     "Reduce using vector shifts\n");
5309 
5310 	  mode1 = TYPE_MODE (vectype1);
5311           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5312           for (elt_offset = nelements / 2;
5313                elt_offset >= 1;
5314                elt_offset /= 2)
5315             {
5316 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5317 	      indices.new_vector (sel, 2, nelements);
5318 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5319 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5320 						 new_temp, zero_vec, mask);
5321               new_name = make_ssa_name (vec_dest, epilog_stmt);
5322               gimple_assign_set_lhs (epilog_stmt, new_name);
5323               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324 
5325 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5326 						 new_temp);
5327               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5328               gimple_assign_set_lhs (epilog_stmt, new_temp);
5329               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330             }
5331 
5332 	  /* 2.4  Extract the final scalar result.  Create:
5333 	     s_out3 = extract_field <v_out2, bitpos>  */
5334 
5335 	  if (dump_enabled_p ())
5336 	    dump_printf_loc (MSG_NOTE, vect_location,
5337 			     "extract scalar result\n");
5338 
5339 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5340 			bitsize, bitsize_zero_node);
5341 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5342 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5343 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5344 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 	  scalar_results.safe_push (new_temp);
5346         }
5347       else
5348         {
5349           /* Case 3: Create:
5350              s = extract_field <v_out2, 0>
5351              for (offset = element_size;
5352                   offset < vector_size;
5353                   offset += element_size;)
5354                {
5355                  Create:  s' = extract_field <v_out2, offset>
5356                  Create:  s = op <s, s'>  // For non SLP cases
5357                }  */
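          /* E.g. (hypothetical values) for a PLUS reduction on V4SI
             {1, 2, 3, 4} this emits
                s = 1;  s = s + 2;  s = s + 3;  s = s + 4;   # s == 10
             whereas in the SLP case the extracted elements are simply
             pushed to SCALAR_RESULTS without being combined.  */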
5358 
5359           if (dump_enabled_p ())
5360             dump_printf_loc (MSG_NOTE, vect_location,
5361 			     "Reduce using scalar code.\n");
5362 
5363 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5364 	  int element_bitsize = tree_to_uhwi (bitsize);
5365           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5366             {
5367               int bit_offset;
5368               if (gimple_code (new_phi) == GIMPLE_PHI)
5369                 vec_temp = PHI_RESULT (new_phi);
5370               else
5371                 vec_temp = gimple_assign_lhs (new_phi);
5372               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5373 				 bitsize_zero_node);
5374               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376               gimple_assign_set_lhs (epilog_stmt, new_temp);
5377               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378 
5379               /* In SLP we don't need to apply the reduction operation, so we
5380                  just collect the s' values in SCALAR_RESULTS.  */
5381               if (slp_reduc)
5382                 scalar_results.safe_push (new_temp);
5383 
5384               for (bit_offset = element_bitsize;
5385                    bit_offset < vec_size_in_bits;
5386                    bit_offset += element_bitsize)
5387                 {
5388                   tree bitpos = bitsize_int (bit_offset);
5389                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5390                                      bitsize, bitpos);
5391 
5392                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5393                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5394                   gimple_assign_set_lhs (epilog_stmt, new_name);
5395                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396 
5397                   if (slp_reduc)
5398                     {
5399                       /* In SLP we don't need to apply the reduction operation,
5400                          so we just collect the s' values in SCALAR_RESULTS.  */
5401                       new_temp = new_name;
5402                       scalar_results.safe_push (new_name);
5403                     }
5404                   else
5405                     {
5406 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5407 							 new_name, new_temp);
5408                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5410                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411                     }
5412                 }
5413             }
5414 
5415           /* The only case where we need to reduce scalar results in SLP is
5416              unrolling.  If the size of SCALAR_RESULTS is greater than
5417              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5418              REDUC_GROUP_SIZE.  */
5419           if (slp_reduc)
5420             {
5421               tree res, first_res, new_res;
5422 	      gimple *new_stmt;
5423 
5424               /* Reduce multiple scalar results in case of SLP unrolling.  */
5425               for (j = group_size; scalar_results.iterate (j, &res);
5426                    j++)
5427                 {
5428                   first_res = scalar_results[j % group_size];
5429 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
5430 						  first_res, res);
5431                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5432                   gimple_assign_set_lhs (new_stmt, new_res);
5433                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5434                   scalar_results[j % group_size] = new_res;
5435                 }
5436             }
5437           else
5438             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5439             scalar_results.safe_push (new_temp);
5440         }
5441 
5442       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443 	   == INTEGER_INDUC_COND_REDUCTION)
5444 	  && !operand_equal_p (initial_def, induc_val, 0))
5445 	{
5446 	  /* Earlier we set the initial value to be a vector of induc_val
5447 	     values.  Check the result and if it is induc_val then replace
5448 	     it with the original initial value, unless induc_val is
5449 	     already the same as initial_def.  */
5450 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451 				  induc_val);
5452 
5453 	  tree tmp = make_ssa_name (new_scalar_dest);
5454 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455 					     initial_def, new_temp);
5456 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 	  scalar_results[0] = tmp;
5458 	}
5459     }
5460 
5461 vect_finalize_reduction:
5462 
5463   if (double_reduc)
5464     loop = loop->inner;
5465 
5466   /* 2.5 Adjust the final result by the initial value of the reduction
5467 	 variable. (When such adjustment is not needed, then
5468 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
5469 	 new_temp = loop_exit_def + adjustment_def  */
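  /* For instance (hypothetical values), for a sum reduction whose scalar
     initial value 5 was peeled off into adjustment_def so that the vector
     accumulator could start from zero, this emits s_out4 = s_out3 + 5.  */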
5470 
5471   if (adjustment_def)
5472     {
5473       gcc_assert (!slp_reduc);
5474       if (nested_in_vect_loop)
5475 	{
5476           new_phi = new_phis[0];
5477 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5478 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5479 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
5480 	}
5481       else
5482 	{
5483           new_temp = scalar_results[0];
5484 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5485 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
5486 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5487 	}
5488 
5489       epilog_stmt = gimple_build_assign (new_dest, expr);
5490       new_temp = make_ssa_name (new_dest, epilog_stmt);
5491       gimple_assign_set_lhs (epilog_stmt, new_temp);
5492       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5493       if (nested_in_vect_loop)
5494         {
5495 	  stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5496 	  STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5497 	    = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5498 
5499           if (!double_reduc)
5500             scalar_results.quick_push (new_temp);
5501           else
5502             scalar_results[0] = new_temp;
5503         }
5504       else
5505         scalar_results[0] = new_temp;
5506 
5507       new_phis[0] = epilog_stmt;
5508     }
5509 
5510   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5511           phis with new adjusted scalar results, i.e., replace use <s_out0>
5512           with use <s_out4>.
5513 
5514      Transform:
5515         loop_exit:
5516           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5517           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5518           v_out2 = reduce <v_out1>
5519           s_out3 = extract_field <v_out2, 0>
5520           s_out4 = adjust_result <s_out3>
5521           use <s_out0>
5522           use <s_out0>
5523 
5524      into:
5525 
5526         loop_exit:
5527           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5528           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5529           v_out2 = reduce <v_out1>
5530           s_out3 = extract_field <v_out2, 0>
5531           s_out4 = adjust_result <s_out3>
5532           use <s_out4>
5533           use <s_out4> */
5534 
5535 
5536   /* In an SLP reduction chain we reduce vector results into one vector if
5537      necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5538      LHS of the last stmt in the reduction chain, since we are looking for
5539      the loop exit phi node.  */
5540   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5541     {
5542       stmt_vec_info dest_stmt_info
5543 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5544       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5545       group_size = 1;
5546     }
5547 
5548   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5549      case REDUC_GROUP_SIZE is greater than the vectorization factor).
5550      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5551      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5552      correspond to the first vector stmt, etc.
5553      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5554   if (group_size > new_phis.length ())
5555     {
5556       ratio = group_size / new_phis.length ();
5557       gcc_assert (!(group_size % new_phis.length ()));
5558     }
5559   else
5560     ratio = 1;
5561 
5562   stmt_vec_info epilog_stmt_info = NULL;
5563   for (k = 0; k < group_size; k++)
5564     {
5565       if (k % ratio == 0)
5566         {
5567 	  epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5568 	  reduction_phi_info = reduction_phis[k / ratio];
5569 	  if (double_reduc)
5570 	    inner_phi = inner_phis[k / ratio];
5571         }
5572 
5573       if (slp_reduc)
5574         {
5575 	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5576 
5577 	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5578 	  /* SLP statements can't participate in patterns.  */
5579 	  gcc_assert (!orig_stmt_info);
5580 	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5581         }
5582 
5583       phis.create (3);
5584       /* Find the loop-closed-use at the loop exit of the original scalar
5585          result.  (The reduction result is expected to have two immediate uses -
5586          one at the latch block, and one at the loop exit).  */
5587       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5588         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5589 	    && !is_gimple_debug (USE_STMT (use_p)))
5590           phis.safe_push (USE_STMT (use_p));
5591 
5592       /* While we expect to have found an exit_phi because of loop-closed-ssa
5593          form, we can end up without one if the scalar cycle is dead.  */
5594 
5595       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5596         {
5597           if (outer_loop)
5598             {
5599 	      stmt_vec_info exit_phi_vinfo
5600 		= loop_vinfo->lookup_stmt (exit_phi);
5601               gphi *vect_phi;
5602 
5603 	      if (double_reduc)
5604 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5605 	      else
5606 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5607               if (!double_reduc
5608                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5609                       != vect_double_reduction_def)
5610                 continue;
5611 
5612               /* Handle double reduction:
5613 
5614                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5615                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5616                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5617                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5618 
5619                  At that point the regular reduction (stmt2 and stmt3) is
5620                  already vectorized, as well as the exit phi node, stmt4.
5621                  Here we vectorize the phi node of double reduction, stmt1, and
5622                  update all relevant statements.  */
5623 
5624               /* Go through all the uses of s2 to find double reduction phi
5625                  node, i.e., stmt1 above.  */
5626               orig_name = PHI_RESULT (exit_phi);
5627               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5628                 {
5629                   stmt_vec_info use_stmt_vinfo;
5630                   tree vect_phi_init, preheader_arg, vect_phi_res;
5631                   basic_block bb = gimple_bb (use_stmt);
5632 
5633                   /* Check that USE_STMT is really a double reduction phi
5634                      node.  */
5635                   if (gimple_code (use_stmt) != GIMPLE_PHI
5636                       || gimple_phi_num_args (use_stmt) != 2
5637                       || bb->loop_father != outer_loop)
5638                     continue;
5639 		  use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5640                   if (!use_stmt_vinfo
5641                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5642                           != vect_double_reduction_def)
5643 		    continue;
5644 
5645                   /* Create vector phi node for double reduction:
5646                      vs1 = phi <vs0, vs2>
5647                      vs1 was created previously in this function by a call to
5648                        vect_get_vec_def_for_operand and is stored in
5649                        vec_initial_def;
5650                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5651                      vs0 is created here.  */
5652 
5653                   /* Create vector phi node.  */
5654                   vect_phi = create_phi_node (vec_initial_def, bb);
5655 		  loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5656 
5657                   /* Create vs0 - initial def of the double reduction phi.  */
5658                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5659                                              loop_preheader_edge (outer_loop));
5660                   vect_phi_init = get_initial_def_for_reduction
5661 		    (stmt_info, preheader_arg, NULL);
5662 
5663                   /* Update phi node arguments with vs0 and vs2.  */
5664                   add_phi_arg (vect_phi, vect_phi_init,
5665                                loop_preheader_edge (outer_loop),
5666                                UNKNOWN_LOCATION);
5667 		  add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5668 			       loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5669                   if (dump_enabled_p ())
5670 		    dump_printf_loc (MSG_NOTE, vect_location,
5671 				     "created double reduction phi node: %G",
5672 				     vect_phi);
5673 
5674                   vect_phi_res = PHI_RESULT (vect_phi);
5675 
5676                   /* Replace the use, i.e., set the correct vs1 in the regular
5677                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5678                      loop is redundant.  */
5679 		  stmt_vec_info use_info = reduction_phi_info;
5680 		  for (j = 0; j < ncopies; j++)
5681 		    {
5682 		      edge pr_edge = loop_preheader_edge (loop);
5683 		      SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5684 				       pr_edge->dest_idx, vect_phi_res);
5685 		      use_info = STMT_VINFO_RELATED_STMT (use_info);
5686 		    }
5687                 }
5688             }
5689         }
5690 
5691       phis.release ();
5692       if (nested_in_vect_loop)
5693         {
5694           if (double_reduc)
5695             loop = outer_loop;
5696           else
5697             continue;
5698         }
5699 
5700       phis.create (3);
5701       /* Find the loop-closed-use at the loop exit of the original scalar
5702          result.  (The reduction result is expected to have two immediate uses,
5703          one at the latch block, and one at the loop exit).  For double
5704          reductions we are looking for exit phis of the outer loop.  */
5705       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5706         {
5707           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5708 	    {
5709 	      if (!is_gimple_debug (USE_STMT (use_p)))
5710 		phis.safe_push (USE_STMT (use_p));
5711 	    }
5712           else
5713             {
5714               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5715                 {
5716                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5717 
5718                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5719                     {
5720                       if (!flow_bb_inside_loop_p (loop,
5721                                              gimple_bb (USE_STMT (phi_use_p)))
5722 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5723                         phis.safe_push (USE_STMT (phi_use_p));
5724                     }
5725                 }
5726             }
5727         }
5728 
5729       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5730         {
5731           /* Replace the uses:  */
5732           orig_name = PHI_RESULT (exit_phi);
5733           scalar_result = scalar_results[k];
5734           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5735             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5736               SET_USE (use_p, scalar_result);
5737         }
5738 
5739       phis.release ();
5740     }
5741 }
5742 
5743 /* Return a vector of type VECTYPE that is equal to the vector select
5744    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5745    before GSI.  */
5746 
5747 static tree
5748 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5749 		     tree vec, tree identity)
5750 {
5751   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5752   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5753 					  mask, vec, identity);
5754   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5755   return cond;
5756 }
5757 
5758 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5759    order, starting with LHS.  Insert the extraction statements before GSI and
5760    associate the new scalar SSA names with variable SCALAR_DEST.
5761    Return the SSA name for the result.  */
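/* For example (an illustrative sketch): with CODE == PLUS_EXPR, LHS == l0
   and a 4-element VECTOR_RHS <x0, x1, x2, x3>, the emitted sequence is

	s = x0;  l1 = l0 + s;
	s = x1;  l2 = l1 + s;
	s = x2;  l3 = l2 + s;
	s = x3;  l4 = l3 + s;

   and l4 is returned, preserving the strict left-to-right association.  */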
5762 
5763 static tree
5764 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5765 		       tree_code code, tree lhs, tree vector_rhs)
5766 {
5767   tree vectype = TREE_TYPE (vector_rhs);
5768   tree scalar_type = TREE_TYPE (vectype);
5769   tree bitsize = TYPE_SIZE (scalar_type);
5770   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5771   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5772 
5773   for (unsigned HOST_WIDE_INT bit_offset = 0;
5774        bit_offset < vec_size_in_bits;
5775        bit_offset += element_bitsize)
5776     {
5777       tree bitpos = bitsize_int (bit_offset);
5778       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5779 			 bitsize, bitpos);
5780 
5781       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5782       rhs = make_ssa_name (scalar_dest, stmt);
5783       gimple_assign_set_lhs (stmt, rhs);
5784       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5785 
5786       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5787       tree new_name = make_ssa_name (scalar_dest, stmt);
5788       gimple_assign_set_lhs (stmt, new_name);
5789       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5790       lhs = new_name;
5791     }
5792   return lhs;
5793 }
5794 
5795 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5796    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5797    statement.  CODE is the operation performed by STMT_INFO and OPS are
5798    its scalar operands.  REDUC_INDEX is the index of the operand in
5799    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5800    implements in-order reduction, or IFN_LAST if we should open-code it.
5801    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5802    that should be used to control the operation in a fully-masked loop.  */
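/* As an informal illustration: for a 4-lane vector V, an in-order (fold-left)
   PLUS reduction computes ((((INIT + V[0]) + V[1]) + V[2]) + V[3]) rather
   than combining lanes pairwise, preserving the scalar evaluation order
   when floating-point reassociation is not permitted.  */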
5803 
5804 static bool
5805 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5806 			       gimple_stmt_iterator *gsi,
5807 			       stmt_vec_info *vec_stmt, slp_tree slp_node,
5808 			       gimple *reduc_def_stmt,
5809 			       tree_code code, internal_fn reduc_fn,
5810 			       tree ops[3], tree vectype_in,
5811 			       int reduc_index, vec_loop_masks *masks)
5812 {
5813   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5814   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5815   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5816   stmt_vec_info new_stmt_info = NULL;
5817 
5818   int ncopies;
5819   if (slp_node)
5820     ncopies = 1;
5821   else
5822     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5823 
5824   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5825   gcc_assert (ncopies == 1);
5826   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5827   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5828   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5829 	      == FOLD_LEFT_REDUCTION);
5830 
5831   if (slp_node)
5832     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5833 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
5834 
5835   tree op0 = ops[1 - reduc_index];
5836 
5837   int group_size = 1;
5838   stmt_vec_info scalar_dest_def_info;
5839   auto_vec<tree> vec_oprnds0;
5840   if (slp_node)
5841     {
5842       auto_vec<vec<tree> > vec_defs (2);
5843       auto_vec<tree> sops(2);
5844       sops.quick_push (ops[0]);
5845       sops.quick_push (ops[1]);
5846       vect_get_slp_defs (sops, slp_node, &vec_defs);
5847       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5848       vec_defs[0].release ();
5849       vec_defs[1].release ();
5850       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5851       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5852     }
5853   else
5854     {
5855       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5856       vec_oprnds0.create (1);
5857       vec_oprnds0.quick_push (loop_vec_def0);
5858       scalar_dest_def_info = stmt_info;
5859     }
5860 
5861   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5862   tree scalar_type = TREE_TYPE (scalar_dest);
5863   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5864 
5865   int vec_num = vec_oprnds0.length ();
5866   gcc_assert (vec_num == 1 || slp_node);
5867   tree vec_elem_type = TREE_TYPE (vectype_out);
5868   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5869 
5870   tree vector_identity = NULL_TREE;
5871   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5872     vector_identity = build_zero_cst (vectype_out);
5873 
5874   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5875   int i;
5876   tree def0;
5877   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5878     {
5879       gimple *new_stmt;
5880       tree mask = NULL_TREE;
5881       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5882 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5883 
5884       /* Handle MINUS by adding the negative.  */
5885       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5886 	{
5887 	  tree negated = make_ssa_name (vectype_out);
5888 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5889 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5890 	  def0 = negated;
5891 	}
5892 
5893       if (mask)
5894 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5895 				    vector_identity);
5896 
5897       /* On the first iteration the input is simply the scalar phi
5898 	 result, and for subsequent iterations it is the output of
5899 	 the preceding operation.  */
5900       if (reduc_fn != IFN_LAST)
5901 	{
5902 	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5903 	  /* For chained SLP reductions the output of the previous reduction
5904 	     operation serves as the input of the next. For the final statement
5905 	     the output cannot be a temporary - we reuse the original
5906 	     scalar destination of the last statement.  */
5907 	  if (i != vec_num - 1)
5908 	    {
5909 	      gimple_set_lhs (new_stmt, scalar_dest_var);
5910 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5911 	      gimple_set_lhs (new_stmt, reduc_var);
5912 	    }
5913 	}
5914       else
5915 	{
5916 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5917 					     reduc_var, def0);
5918 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5919 	  /* Remove the statement, so that we can use the same code paths
5920 	     as for statements that we've just created.  */
5921 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5922 	  gsi_remove (&tmp_gsi, true);
5923 	}
5924 
5925       if (i == vec_num - 1)
5926 	{
5927 	  gimple_set_lhs (new_stmt, scalar_dest);
5928 	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5929 						    new_stmt);
5930 	}
5931       else
5932 	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5933 						     new_stmt, gsi);
5934 
5935       if (slp_node)
5936 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5937     }
5938 
5939   if (!slp_node)
5940     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5941 
5942   return true;
5943 }
5944 
5945 /* Function is_nonwrapping_integer_induction.
5946 
5947    Check if STMT_VINFO (which is part of loop LOOP) both increments and
5948    does not cause overflow.  */
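/* Informally, the check below is: with constant BASE and STEP and at most
   NI executions of the statement, the largest value reached is
   BASE + STEP * NI, and this value (computed without intermediate
   overflow) must still fit in the precision of the PHI result type.
   Types with undefined overflow are accepted immediately.  */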
5949 
5950 static bool
5951 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5952 {
5953   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5954   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5955   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5956   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5957   widest_int ni, max_loop_value, lhs_max;
5958   wi::overflow_type overflow = wi::OVF_NONE;
5959 
5960   /* Make sure the loop is integer based.  */
5961   if (TREE_CODE (base) != INTEGER_CST
5962       || TREE_CODE (step) != INTEGER_CST)
5963     return false;
5964 
5965   /* Check that the max size of the loop will not wrap.  */
5966 
5967   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5968     return true;
5969 
5970   if (! max_stmt_executions (loop, &ni))
5971     return false;
5972 
5973   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5974 			    &overflow);
5975   if (overflow)
5976     return false;
5977 
5978   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5979 			    TYPE_SIGN (lhs_type), &overflow);
5980   if (overflow)
5981     return false;
5982 
5983   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5984 	  <= TYPE_PRECISION (lhs_type));
5985 }
5986 
5987 /* Function vectorizable_reduction.
5988 
5989    Check if STMT_INFO performs a reduction operation that can be vectorized.
5990    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5991    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5992    Return true if STMT_INFO is vectorizable in this way.
5993 
5994    This function also handles reduction idioms (patterns) that have been
5995    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5996    may be of this form:
5997      X = pattern_expr (arg0, arg1, ..., X)
5998    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5999    sequence that had been detected and replaced by the pattern-stmt
6000    (STMT_INFO).
6001 
6002    This function also handles reduction of condition expressions, for example:
6003      for (int i = 0; i < N; i++)
6004        if (a[i] < value)
6005 	 last = a[i];
6006    This is handled by vectorising the loop and creating an additional vector
6007    containing the loop indexes for which "a[i] < value" was true.  In the
6008    function epilogue this is reduced to a single max value and then used to
6009    index into the vector of results.
6010 
6011    In some cases of reduction patterns, the type of the reduction variable X is
6012    different than the type of the other arguments of STMT_INFO.
6013    In such cases, the vectype that is used when transforming STMT_INFO into
6014    a vector stmt is different than the vectype that is used to determine the
6015    vectorization factor, because it consists of a different number of elements
6016    than the actual number of elements that are being operated upon in parallel.
6017 
6018    For example, consider an accumulation of shorts into an int accumulator.
6019    On some targets it's possible to vectorize this pattern operating on 8
6020    shorts at a time (hence, the vectype for purposes of determining the
6021    vectorization factor should be V8HI); on the other hand, the vectype that
6022    is used to create the vector form is actually V4SI (the type of the result).
6023 
6024    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6025    indicates what is the actual level of parallelism (V8HI in the example), so
6026    that the right vectorization factor would be derived.  This vectype
6027    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6028    be used to create the vectorized stmt.  The right vectype for the vectorized
6029    stmt is obtained from the type of the result X:
6030         get_vectype_for_scalar_type (TREE_TYPE (X))
6031 
6032    This means that, contrary to "regular" reductions (or "regular" stmts in
6033    general), the following equation:
6034       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6035    does *NOT* necessarily hold for reduction patterns.  */
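/* A concrete (purely illustrative) shape of the pattern case above:

        short a[N];  int sum = 0;
        for (i = 0; i < N; i++)
          sum += a[i];

   Here STMT_VINFO_VECTYPE is V8HI, because eight shorts are consumed per
   vector iteration and so determine the vectorization factor, while the
   widen-sum statement itself is created with the V4SI type obtained from
   get_vectype_for_scalar_type (TREE_TYPE (sum)).  */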
6036 
6037 bool
6038 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6039 			stmt_vec_info *vec_stmt, slp_tree slp_node,
6040 			slp_instance slp_node_instance,
6041 			stmt_vector_for_cost *cost_vec)
6042 {
6043   tree vec_dest;
6044   tree scalar_dest;
6045   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6046   tree vectype_in = NULL_TREE;
6047   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6048   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6049   enum tree_code code, orig_code;
6050   internal_fn reduc_fn;
6051   machine_mode vec_mode;
6052   int op_type;
6053   optab optab;
6054   tree new_temp = NULL_TREE;
6055   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6056   stmt_vec_info cond_stmt_vinfo = NULL;
6057   enum tree_code cond_reduc_op_code = ERROR_MARK;
6058   tree scalar_type;
6059   bool is_simple_use;
6060   int i;
6061   int ncopies;
6062   int epilog_copies;
6063   stmt_vec_info prev_stmt_info, prev_phi_info;
6064   bool single_defuse_cycle = false;
6065   stmt_vec_info new_stmt_info = NULL;
6066   int j;
6067   tree ops[3];
6068   enum vect_def_type dts[3];
6069   bool nested_cycle = false, found_nested_cycle_def = false;
6070   bool double_reduc = false;
6071   basic_block def_bb;
6072   struct loop * def_stmt_loop;
6073   tree def_arg;
6074   auto_vec<tree> vec_oprnds0;
6075   auto_vec<tree> vec_oprnds1;
6076   auto_vec<tree> vec_oprnds2;
6077   auto_vec<tree> vect_defs;
6078   auto_vec<stmt_vec_info> phis;
6079   int vec_num;
6080   tree def0, tem;
6081   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6082   tree cond_reduc_val = NULL_TREE;
6083 
6084   /* Make sure it was already recognized as a reduction computation.  */
6085   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6086       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6087     return false;
6088 
6089   if (nested_in_vect_loop_p (loop, stmt_info))
6090     {
6091       loop = loop->inner;
6092       nested_cycle = true;
6093     }
6094 
6095   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6096     gcc_assert (slp_node
6097 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6098 
6099   if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6100     {
6101       tree phi_result = gimple_phi_result (phi);
6102       /* Analysis is fully done on the reduction stmt invocation.  */
6103       if (! vec_stmt)
6104 	{
6105 	  if (slp_node)
6106 	    slp_node_instance->reduc_phis = slp_node;
6107 
6108 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6109 	  return true;
6110 	}
6111 
6112       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6113 	/* Leave the scalar phi in place.  Note that checking
6114 	   STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6115 	   for reductions involving a single statement.  */
6116 	return true;
6117 
6118       stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6119       reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6120 
6121       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6122 	  == EXTRACT_LAST_REDUCTION)
6123 	/* Leave the scalar phi in place.  */
6124 	return true;
6125 
6126       gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6127       code = gimple_assign_rhs_code (reduc_stmt);
6128       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6129 	{
6130 	  tree op = gimple_op (reduc_stmt, k);
6131 	  if (op == phi_result)
6132 	    continue;
6133 	  if (k == 1 && code == COND_EXPR)
6134 	    continue;
6135 	  bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6136 	  gcc_assert (is_simple_use);
6137 	  if (dt == vect_constant_def || dt == vect_external_def)
6138 	    continue;
6139 	  if (!vectype_in
6140 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6141 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6142 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6143 	  break;
6144 	}
6145       /* For a nested cycle we might end up with an operation like
6146          phi_result * phi_result.  */
6147       if (!vectype_in)
6148 	vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6149       gcc_assert (vectype_in);
6150 
6151       if (slp_node)
6152 	ncopies = 1;
6153       else
6154 	ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6155 
6156       stmt_vec_info use_stmt_info;
6157       if (ncopies > 1
6158 	  && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6159 	  && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6160 	  && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6161 	single_defuse_cycle = true;
6162 
6163       /* Create the destination vector.  */
6164       scalar_dest = gimple_assign_lhs (reduc_stmt);
6165       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6166 
6167       if (slp_node)
6168 	/* The size vect_schedule_slp_instance computes is off for us.  */
6169 	vec_num = vect_get_num_vectors
6170 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6171 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6172 	   vectype_in);
6173       else
6174 	vec_num = 1;
6175 
6176       /* Generate the reduction PHIs upfront.  */
6177       prev_phi_info = NULL;
6178       for (j = 0; j < ncopies; j++)
6179 	{
6180 	  if (j == 0 || !single_defuse_cycle)
6181 	    {
6182 	      for (i = 0; i < vec_num; i++)
6183 		{
6184 		  /* Create the reduction-phi that defines the reduction
6185 		     operand.  */
6186 		  gimple *new_phi = create_phi_node (vec_dest, loop->header);
6187 		  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6188 
6189 		  if (slp_node)
6190 		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6191 		  else
6192 		    {
6193 		      if (j == 0)
6194 			STMT_VINFO_VEC_STMT (stmt_info)
6195 			  = *vec_stmt = new_phi_info;
6196 		      else
6197 			STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6198 		      prev_phi_info = new_phi_info;
6199 		    }
6200 		}
6201 	    }
6202 	}
6203 
6204       return true;
6205     }
6206 
6207   /* 1. Is vectorizable reduction?  */
6208   /* Not supportable if the reduction variable is used in the loop, unless
6209      it's a reduction chain.  */
6210   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6211       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6212     return false;
6213 
6214   /* Reductions that are not used even in an enclosing outer-loop
6215      are expected to be "live" (used out of the loop).  */
6216   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6217       && !STMT_VINFO_LIVE_P (stmt_info))
6218     return false;
6219 
6220   /* 2. Has this been recognized as a reduction pattern?
6221 
6222      Check if STMT represents a pattern that has been recognized
6223      in earlier analysis stages.  For stmts that represent a pattern,
6224      the STMT_VINFO_RELATED_STMT field records the last stmt in
6225      the original sequence that constitutes the pattern.  */
6226 
6227   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6228   if (orig_stmt_info)
6229     {
6230       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6231       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6232     }
6233 
6234   /* 3. Check the operands of the operation.  The first operands are defined
6235         inside the loop body. The last operand is the reduction variable,
6236         which is defined by the loop-header-phi.  */
6237 
6238   gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6239 
6240   /* Flatten RHS.  */
6241   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6242     {
6243     case GIMPLE_BINARY_RHS:
6244       code = gimple_assign_rhs_code (stmt);
6245       op_type = TREE_CODE_LENGTH (code);
6246       gcc_assert (op_type == binary_op);
6247       ops[0] = gimple_assign_rhs1 (stmt);
6248       ops[1] = gimple_assign_rhs2 (stmt);
6249       break;
6250 
6251     case GIMPLE_TERNARY_RHS:
6252       code = gimple_assign_rhs_code (stmt);
6253       op_type = TREE_CODE_LENGTH (code);
6254       gcc_assert (op_type == ternary_op);
6255       ops[0] = gimple_assign_rhs1 (stmt);
6256       ops[1] = gimple_assign_rhs2 (stmt);
6257       ops[2] = gimple_assign_rhs3 (stmt);
6258       break;
6259 
6260     case GIMPLE_UNARY_RHS:
6261       return false;
6262 
6263     default:
6264       gcc_unreachable ();
6265     }
6266 
6267   if (code == COND_EXPR && slp_node)
6268     return false;
6269 
6270   scalar_dest = gimple_assign_lhs (stmt);
6271   scalar_type = TREE_TYPE (scalar_dest);
6272   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6273       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6274     return false;
6275 
6276   /* Do not try to vectorize bit-precision reductions.  */
6277   if (!type_has_mode_precision_p (scalar_type))
6278     return false;
6279 
6280   /* All uses but the last are expected to be defined in the loop.
6281      The last use is the reduction variable.  In case of a nested cycle this
6282      assumption is not true: we use reduc_index to record the index of the
6283      reduction variable.  */
6284   stmt_vec_info reduc_def_info;
6285   if (orig_stmt_info)
6286     reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6287   else
6288     reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6289   gcc_assert (reduc_def_info);
6290   gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6291   tree reduc_def = PHI_RESULT (reduc_def_phi);
6292   int reduc_index = -1;
6293   for (i = 0; i < op_type; i++)
6294     {
6295       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6296       if (i == 0 && code == COND_EXPR)
6297         continue;
6298 
6299       stmt_vec_info def_stmt_info;
6300       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6301 					  &def_stmt_info);
6302       dt = dts[i];
6303       gcc_assert (is_simple_use);
6304       if (dt == vect_reduction_def
6305 	  && ops[i] == reduc_def)
6306 	{
6307 	  reduc_index = i;
6308 	  continue;
6309 	}
6310       else if (tem)
6311 	{
6312 	  /* To properly compute ncopies we are interested in the widest
6313 	     input type in case we're looking at a widening accumulation.  */
6314 	  if (!vectype_in
6315 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6316 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6317 	    vectype_in = tem;
6318 	}
6319 
6320       if (dt != vect_internal_def
6321 	  && dt != vect_external_def
6322 	  && dt != vect_constant_def
6323 	  && dt != vect_induction_def
6324           && !(dt == vect_nested_cycle && nested_cycle))
6325 	return false;
6326 
6327       if (dt == vect_nested_cycle
6328 	  && ops[i] == reduc_def)
6329 	{
6330 	  found_nested_cycle_def = true;
6331 	  reduc_index = i;
6332 	}
6333 
6334       if (i == 1 && code == COND_EXPR)
6335 	{
6336 	  /* Record how value of COND_EXPR is defined.  */
6337 	  if (dt == vect_constant_def)
6338 	    {
6339 	      cond_reduc_dt = dt;
6340 	      cond_reduc_val = ops[i];
6341 	    }
6342 	  if (dt == vect_induction_def
6343 	      && def_stmt_info
6344 	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6345 	    {
6346 	      cond_reduc_dt = dt;
6347 	      cond_stmt_vinfo = def_stmt_info;
6348 	    }
6349 	}
6350     }
6351 
6352   if (!vectype_in)
6353     vectype_in = vectype_out;
6354 
6355   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6356      directly used in stmt.  */
6357   if (reduc_index == -1)
6358     {
6359       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6360 	{
6361 	  if (dump_enabled_p ())
6362 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6363 			     "in-order reduction chain without SLP.\n");
6364 	  return false;
6365 	}
6366     }
6367 
6368   if (!(reduc_index == -1
6369 	|| dts[reduc_index] == vect_reduction_def
6370 	|| dts[reduc_index] == vect_nested_cycle
6371 	|| ((dts[reduc_index] == vect_internal_def
6372 	     || dts[reduc_index] == vect_external_def
6373 	     || dts[reduc_index] == vect_constant_def
6374 	     || dts[reduc_index] == vect_induction_def)
6375 	    && nested_cycle && found_nested_cycle_def)))
6376     {
6377       /* For pattern recognized stmts, orig_stmt might be a reduction,
6378 	 but some helper statements for the pattern might not, or
6379 	 might be COND_EXPRs with reduction uses in the condition.  */
6380       gcc_assert (orig_stmt_info);
6381       return false;
6382     }
6383 
6384   /* PHIs should not participate in patterns.  */
6385   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6386   enum vect_reduction_type v_reduc_type
6387     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6388   stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6389 
6390   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6391   /* If we have a condition reduction, see if we can simplify it further.  */
6392   if (v_reduc_type == COND_REDUCTION)
6393     {
6394       /* TODO: We can't yet handle reduction chains, since we need to treat
6395 	 each COND_EXPR in the chain specially, not just the last one.
6396 	 E.g. for:
6397 
6398 	    x_1 = PHI <x_3, ...>
6399 	    x_2 = a_2 ? ... : x_1;
6400 	    x_3 = a_3 ? ... : x_2;
6401 
6402 	 we're interested in the last element in x_3 for which a_2 || a_3
6403 	 is true, whereas the current reduction chain handling would
6404 	 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6405 	 as a reduction operation.  */
6406       if (reduc_index == -1)
6407 	{
6408 	  if (dump_enabled_p ())
6409 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6410 			     "conditional reduction chains not supported\n");
6411 	  return false;
6412 	}
6413 
6414       /* vect_is_simple_reduction ensured that operand 2 is the
6415 	 loop-carried operand.  */
6416       gcc_assert (reduc_index == 2);
6417 
6418       /* Loop peeling modifies the initial value of the reduction PHI, which
6419 	 makes the reduction stmt to be transformed differ from the original
6420 	 stmt analyzed.  We therefore record the reduction code for a
6421 	 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6422 	 it can be used directly at the transform stage.  */
6423       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6424 	  || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6425 	{
6426 	  /* Also set the reduction type to CONST_COND_REDUCTION.  */
6427 	  gcc_assert (cond_reduc_dt == vect_constant_def);
6428 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6429 	}
6430       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6431 					       vectype_in, OPTIMIZE_FOR_SPEED))
6432 	{
6433 	  if (dump_enabled_p ())
6434 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6435 			     "optimizing condition reduction with"
6436 			     " FOLD_EXTRACT_LAST.\n");
6437 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6438 	}
6439       else if (cond_reduc_dt == vect_induction_def)
6440 	{
6441 	  tree base
6442 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6443 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6444 
6445 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6446 		      && TREE_CODE (step) == INTEGER_CST);
6447 	  cond_reduc_val = NULL_TREE;
6448 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6449 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6450 	    ;
6451 	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6452 	     above base; punt if base is the minimum value of the type for
6453 	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6454 	  else if (tree_int_cst_sgn (step) == -1)
6455 	    {
6456 	      cond_reduc_op_code = MIN_EXPR;
6457 	      if (tree_int_cst_sgn (base) == -1)
6458 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6459 	      else if (tree_int_cst_lt (base,
6460 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6461 		cond_reduc_val
6462 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6463 	    }
6464 	  else
6465 	    {
6466 	      cond_reduc_op_code = MAX_EXPR;
6467 	      if (tree_int_cst_sgn (base) == 1)
6468 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6469 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6470 					base))
6471 		cond_reduc_val
6472 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6473 	    }
6474 	  if (cond_reduc_val)
6475 	    {
6476 	      if (dump_enabled_p ())
6477 		dump_printf_loc (MSG_NOTE, vect_location,
6478 				 "condition expression based on "
6479 				 "integer induction.\n");
6480 	      STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6481 		= INTEGER_INDUC_COND_REDUCTION;
6482 	    }
6483 	}
6484       else if (cond_reduc_dt == vect_constant_def)
6485 	{
6486 	  enum vect_def_type cond_initial_dt;
6487 	  gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6488 	  tree cond_initial_val
6489 	    = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6490 
6491 	  gcc_assert (cond_reduc_val != NULL_TREE);
6492 	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6493 	  if (cond_initial_dt == vect_constant_def
6494 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6495 				     TREE_TYPE (cond_reduc_val)))
6496 	    {
6497 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6498 				    cond_initial_val, cond_reduc_val);
6499 	      if (e && (integer_onep (e) || integer_zerop (e)))
6500 		{
6501 		  if (dump_enabled_p ())
6502 		    dump_printf_loc (MSG_NOTE, vect_location,
6503 				     "condition expression based on "
6504 				     "compile time constant.\n");
6505 		  /* Record reduction code at analysis stage.  */
6506 		  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6507 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6508 		  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6509 		    = CONST_COND_REDUCTION;
6510 		}
6511 	    }
6512 	}
6513     }
6514 
6515   if (orig_stmt_info)
6516     gcc_assert (tmp == orig_stmt_info
6517 		|| REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6518   else
6519     /* We changed STMT to be the first stmt in the reduction chain, hence we
6520        check that in this case the first element in the chain is STMT.  */
6521     gcc_assert (tmp == stmt_info
6522 		|| REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6523 
6524   if (STMT_VINFO_LIVE_P (reduc_def_info))
6525     return false;
6526 
6527   if (slp_node)
6528     ncopies = 1;
6529   else
6530     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6531 
6532   gcc_assert (ncopies >= 1);
6533 
6534   vec_mode = TYPE_MODE (vectype_in);
6535   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6536 
6537   if (nested_cycle)
6538     {
6539       def_bb = gimple_bb (reduc_def_phi);
6540       def_stmt_loop = def_bb->loop_father;
6541       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6542                                        loop_preheader_edge (def_stmt_loop));
6543       stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6544       if (def_arg_stmt_info
6545 	  && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6546 	      == vect_double_reduction_def))
6547         double_reduc = true;
6548     }
6549 
6550   vect_reduction_type reduction_type
6551     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6552   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6553       && ncopies > 1)
6554     {
6555       if (dump_enabled_p ())
6556 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557 			 "multiple types in double reduction or condition "
6558 			 "reduction.\n");
6559       return false;
6560     }
6561 
6562   if (code == COND_EXPR)
6563     {
6564       /* Only call during the analysis stage, otherwise we'll lose
6565 	 STMT_VINFO_TYPE.  */
6566       if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6567 						true, NULL, cost_vec))
6568         {
6569           if (dump_enabled_p ())
6570 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6571 			     "unsupported condition in reduction\n");
6572 	  return false;
6573         }
6574     }
6575   else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6576 	   || code == LROTATE_EXPR || code == RROTATE_EXPR)
6577     {
6578       /* Only call during the analysis stage, otherwise we'll lose
6579 	 STMT_VINFO_TYPE.  We only support this for nested cycles
6580 	 without double reductions at the moment.  */
6581       if (!nested_cycle
6582 	  || double_reduc
6583 	  || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6584 						NULL, cost_vec)))
6585 	{
6586           if (dump_enabled_p ())
6587 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6588 			     "unsupported shift or rotation in reduction\n");
6589 	  return false;
6590 	}
6591     }
6592   else
6593     {
6594       /* 4. Supportable by target?  */
6595 
6596       /* 4.1. check support for the operation in the loop  */
6597       optab = optab_for_tree_code (code, vectype_in, optab_default);
6598       if (!optab)
6599         {
6600           if (dump_enabled_p ())
6601 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6602 			     "no optab.\n");
6603 
6604           return false;
6605         }
6606 
6607       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6608         {
6609           if (dump_enabled_p ())
6610             dump_printf (MSG_NOTE, "op not supported by target.\n");
6611 
6612 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6613 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6614             return false;
6615 
6616           if (dump_enabled_p ())
6617   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6618         }
6619 
6620       /* Worthwhile without SIMD support?  */
6621       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6622 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6623         {
6624           if (dump_enabled_p ())
6625 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6626 			     "not worthwhile without SIMD support.\n");
6627 
6628           return false;
6629         }
6630     }
6631 
6632   /* 4.2. Check support for the epilog operation.
6633 
6634           If STMT represents a reduction pattern, then the type of the
6635           reduction variable may be different than the type of the rest
6636           of the arguments.  For example, consider the case of accumulation
6637           of shorts into an int accumulator; The original code:
6638                         S1: int_a = (int) short_a;
6639           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6640 
6641           was replaced with:
6642                         STMT: int_acc = widen_sum <short_a, int_acc>
6643 
6644           This means that:
6645           1. The tree-code that is used to create the vector operation in the
6646              epilog code (that reduces the partial results) is not the
6647              tree-code of STMT, but is rather the tree-code of the original
6648              stmt from the pattern that STMT is replacing.  I.e, in the example
6649              above we want to use 'widen_sum' in the loop, but 'plus' in the
6650              epilog.
6651           2. The type (mode) we use to check available target support
6652              for the vector operation to be created in the *epilog*, is
6653              determined by the type of the reduction variable (in the example
6654              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6655              However the type (mode) we use to check available target support
6656              for the vector operation to be created *inside the loop*, is
6657              determined by the type of the other arguments to STMT (in the
6658              example we'd check this: optab_handler (widen_sum_optab,
6659 	     vect_short_mode)).
6660 
6661           This is contrary to "regular" reductions, in which the types of all
6662           the arguments are the same as the type of the reduction variable.
6663           For "regular" reductions we can therefore use the same vector type
6664           (and also the same tree-code) when generating the epilog code and
6665           when generating the code inside the loop.  */
6666 
6667   if (orig_stmt_info
6668       && (reduction_type == TREE_CODE_REDUCTION
6669 	  || reduction_type == FOLD_LEFT_REDUCTION))
6670     {
6671       /* This is a reduction pattern: get the vectype from the type of the
6672          reduction variable, and get the tree-code from orig_stmt.  */
6673       orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6674       gcc_assert (vectype_out);
6675       vec_mode = TYPE_MODE (vectype_out);
6676     }
6677   else
6678     {
6679       /* Regular reduction: the same vectype and tree-code used for the
6680          vector code inside the loop can also be used for the epilog code.  */
6681       orig_code = code;
6682 
6683       if (code == MINUS_EXPR)
6684 	orig_code = PLUS_EXPR;
6685 
6686       /* For simple condition reductions, replace with the actual expression
6687 	 we want to base our reduction around.  */
6688       if (reduction_type == CONST_COND_REDUCTION)
6689 	{
6690 	  orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6691 	  gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6692 	}
6693       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6694 	orig_code = cond_reduc_op_code;
6695     }
6696 
6697   reduc_fn = IFN_LAST;
6698 
6699   if (reduction_type == TREE_CODE_REDUCTION
6700       || reduction_type == FOLD_LEFT_REDUCTION
6701       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6702       || reduction_type == CONST_COND_REDUCTION)
6703     {
6704       if (reduction_type == FOLD_LEFT_REDUCTION
6705 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6706 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6707 	{
6708 	  if (reduc_fn != IFN_LAST
6709 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6710 						  OPTIMIZE_FOR_SPEED))
6711 	    {
6712 	      if (dump_enabled_p ())
6713 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 				 "reduc op not supported by target.\n");
6715 
6716 	      reduc_fn = IFN_LAST;
6717 	    }
6718 	}
6719       else
6720 	{
6721 	  if (!nested_cycle || double_reduc)
6722 	    {
6723 	      if (dump_enabled_p ())
6724 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725 				 "no reduc code for scalar code.\n");
6726 
6727 	      return false;
6728 	    }
6729 	}
6730     }
6731   else if (reduction_type == COND_REDUCTION)
6732     {
6733       int scalar_precision
6734 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6735       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6736       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6737 						nunits_out);
6738 
6739       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6740 					  OPTIMIZE_FOR_SPEED))
6741 	reduc_fn = IFN_REDUC_MAX;
6742     }
6743 
6744   if (reduction_type != EXTRACT_LAST_REDUCTION
6745       && (!nested_cycle || double_reduc)
6746       && reduc_fn == IFN_LAST
6747       && !nunits_out.is_constant ())
6748     {
6749       if (dump_enabled_p ())
6750 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6751 			 "missing target support for reduction on"
6752 			 " variable-length vectors.\n");
6753       return false;
6754     }
6755 
6756   /* For SLP reductions, see if there is a neutral value we can use.  */
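  /* (Informally: 0 for PLUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR, 1 for
     MULT_EXPR, and an all-ones constant for BIT_AND_EXPR; the authoritative
     mapping is in neutral_op_for_slp_reduction.)  */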
6757   tree neutral_op = NULL_TREE;
6758   if (slp_node)
6759     neutral_op = neutral_op_for_slp_reduction
6760       (slp_node_instance->reduc_phis, code,
6761        REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6762 
6763   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6764     {
6765       /* We can't support in-order reductions of code such as this:
6766 
6767 	   for (int i = 0; i < n1; ++i)
6768 	     for (int j = 0; j < n2; ++j)
6769 	       l += a[j];
6770 
6771 	 since GCC effectively transforms the loop when vectorizing:
6772 
6773 	   for (int i = 0; i < n1 / VF; ++i)
6774 	     for (int j = 0; j < n2; ++j)
6775 	       for (int k = 0; k < VF; ++k)
6776 		 l += a[j];
6777 
6778 	 which is a reassociation of the original operation.  */
6779       if (dump_enabled_p ())
6780 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6781 			 "in-order double reduction not supported.\n");
6782 
6783       return false;
6784     }
6785 
6786   if (reduction_type == FOLD_LEFT_REDUCTION
6787       && slp_node
6788       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6789     {
6790       /* We cannot use in-order reductions in this case because there is
6791 	 an implicit reassociation of the operations involved.  */
6792       if (dump_enabled_p ())
6793 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 			 "in-order unchained SLP reductions not supported.\n");
6795       return false;
6796     }
6797 
6798   /* For double reductions, and for SLP reductions with a neutral value,
6799      we construct a variable-length initial vector by loading a vector
6800      full of the neutral value and then shift-and-inserting the start
6801      values into the low-numbered elements.  */
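  /* E.g. (a sketch) a plain sum with start value S and neutral value 0 gets
     the initial vector { S, 0, 0, ... } regardless of the runtime vector
     length, which is why IFN_VEC_SHL_INSERT support is required below.  */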
6802   if ((double_reduc || neutral_op)
6803       && !nunits_out.is_constant ()
6804       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6805 					  vectype_out, OPTIMIZE_FOR_SPEED))
6806     {
6807       if (dump_enabled_p ())
6808 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 			 "reduction on variable-length vectors requires"
6810 			 " target support for a vector-shift-and-insert"
6811 			 " operation.\n");
6812       return false;
6813     }
6814 
6815   /* Check extra constraints for variable-length unchained SLP reductions.  */
6816   if (STMT_SLP_TYPE (stmt_info)
6817       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6818       && !nunits_out.is_constant ())
6819     {
6820       /* We checked above that we could build the initial vector when
6821 	 there's a neutral element value.  Check here for the case in
6822 	 which each SLP statement has its own initial value and in which
6823 	 that value needs to be repeated for every instance of the
6824 	 statement within the initial vector.  */
6825       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6826       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6827       if (!neutral_op
6828 	  && !can_duplicate_and_interleave_p (group_size, elt_mode))
6829 	{
6830 	  if (dump_enabled_p ())
6831 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6832 			     "unsupported form of SLP reduction for"
6833 			     " variable-length vectors: cannot build"
6834 			     " initial vector.\n");
6835 	  return false;
6836 	}
6837       /* The epilogue code relies on the number of elements being a multiple
6838 	 of the group size.  The duplicate-and-interleave approach to setting
6839 	 up the initial vector does too.  */
6840       if (!multiple_p (nunits_out, group_size))
6841 	{
6842 	  if (dump_enabled_p ())
6843 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6844 			     "unsupported form of SLP reduction for"
6845 			     " variable-length vectors: the vector size"
6846 			     " is not a multiple of the number of results.\n");
6847 	  return false;
6848 	}
6849     }
6850 
6851   /* In case of widening multiplication by a constant, we update the type
6852      of the constant to be the type of the other operand.  We check that the
6853      constant fits the type in the pattern recognition pass.  */
6854   if (code == DOT_PROD_EXPR
6855       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6856     {
6857       if (TREE_CODE (ops[0]) == INTEGER_CST)
6858         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6859       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6860         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6861       else
6862         {
6863           if (dump_enabled_p ())
6864 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6865 			     "invalid types in dot-prod\n");
6866 
6867           return false;
6868         }
6869     }
6870 
6871   if (reduction_type == COND_REDUCTION)
6872     {
6873       widest_int ni;
6874 
6875       if (! max_loop_iterations (loop, &ni))
6876 	{
6877 	  if (dump_enabled_p ())
6878 	    dump_printf_loc (MSG_NOTE, vect_location,
6879 			     "loop count not known, cannot create cond "
6880 			     "reduction.\n");
6881 	  return false;
6882 	}
6883       /* Convert backedges to iterations.  */
6884       ni += 1;
6885 
6886       /* The additional index will be the same type as the condition.  Check
6887 	 that the iteration count fits into this type less one (because we
6888 	 use up the zero slot for when there are no matches).  */
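      /* For instance (informal): with a 16-bit reduction value the index
	 vector is 16-bit unsigned as well, so the loop may execute at most
	 65534 iterations, index 0 being reserved for "no match".  */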
6889       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6890       if (wi::geu_p (ni, wi::to_widest (max_index)))
6891 	{
6892 	  if (dump_enabled_p ())
6893 	    dump_printf_loc (MSG_NOTE, vect_location,
6894 			     "loop size is greater than data size.\n");
6895 	  return false;
6896 	}
6897     }
6898 
6899   /* In case the vectorization factor (VF) is bigger than the number
6900      of elements that we can fit in a vectype (nunits), we have to generate
6901      more than one vector stmt - i.e. - we need to "unroll" the
6902      vector stmt by a factor VF/nunits.  For more details see documentation
6903      in vectorizable_operation.  */
6904 
6905   /* If the reduction is used in an outer loop we need to generate
6906      VF intermediate results, like so (e.g. for ncopies=2):
6907 	r0 = phi (init, r0)
6908 	r1 = phi (init, r1)
6909 	r0 = x0 + r0;
6910         r1 = x1 + r1;
6911     (i.e. we generate VF results in 2 registers).
6912     In this case we have a separate def-use cycle for each copy, and therefore
6913     for each copy we get the vector def for the reduction variable from the
6914     respective phi node created for this copy.
6915 
6916     Otherwise (the reduction is unused in the loop nest), we can combine
6917     together intermediate results, like so (e.g. for ncopies=2):
6918 	r = phi (init, r)
6919 	r = x0 + r;
6920 	r = x1 + r;
6921    (i.e. we generate VF/2 results in a single register).
6922    In this case for each copy we get the vector def for the reduction variable
6923    from the vectorized reduction operation generated in the previous iteration.
6924 
6925    This only works when we see both the reduction PHI and its only consumer
6926    in vectorizable_reduction and there are no intermediate stmts
6927    participating.  */
6928   stmt_vec_info use_stmt_info;
6929   tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6930   if (ncopies > 1
6931       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6932       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6933       && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6934     {
6935       single_defuse_cycle = true;
6936       epilog_copies = 1;
6937     }
6938   else
6939     epilog_copies = ncopies;
6940 
6941   /* If the reduction stmt is one of the patterns that have lane
6942      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6943   if ((ncopies > 1
6944        && ! single_defuse_cycle)
6945       && (code == DOT_PROD_EXPR
6946 	  || code == WIDEN_SUM_EXPR
6947 	  || code == SAD_EXPR))
6948     {
6949       if (dump_enabled_p ())
6950 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6951 			 "multi def-use cycle not possible for lane-reducing "
6952 			 "reduction operation\n");
6953       return false;
6954     }
6955 
6956   if (slp_node)
6957     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6958   else
6959     vec_num = 1;
6960 
6961   internal_fn cond_fn = get_conditional_internal_fn (code);
6962   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6963 
6964   if (!vec_stmt) /* transformation not required.  */
6965     {
6966       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6967       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6968 	{
6969 	  if (reduction_type != FOLD_LEFT_REDUCTION
6970 	      && (cond_fn == IFN_LAST
6971 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6972 						      OPTIMIZE_FOR_SPEED)))
6973 	    {
6974 	      if (dump_enabled_p ())
6975 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6976 				 "can't use a fully-masked loop because no"
6977 				 " conditional operation is available.\n");
6978 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6979 	    }
6980 	  else if (reduc_index == -1)
6981 	    {
6982 	      if (dump_enabled_p ())
6983 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6984 				 "can't use a fully-masked loop for chained"
6985 				 " reductions.\n");
6986 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6987 	    }
6988 	  else
6989 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6990 				   vectype_in);
6991 	}
6992       if (dump_enabled_p ()
6993 	  && reduction_type == FOLD_LEFT_REDUCTION)
6994 	dump_printf_loc (MSG_NOTE, vect_location,
6995 			 "using an in-order (fold-left) reduction.\n");
6996       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6997       return true;
6998     }
6999 
7000   /* Transform.  */
7001 
7002   if (dump_enabled_p ())
7003     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7004 
7005   /* FORNOW: Multiple types are not supported for condition.  */
7006   if (code == COND_EXPR)
7007     gcc_assert (ncopies == 1);
7008 
7009   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7010 
7011   if (reduction_type == FOLD_LEFT_REDUCTION)
7012     return vectorize_fold_left_reduction
7013       (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7014        reduc_fn, ops, vectype_in, reduc_index, masks);
7015 
7016   if (reduction_type == EXTRACT_LAST_REDUCTION)
7017     {
7018       gcc_assert (!slp_node);
7019       return vectorizable_condition (stmt_info, gsi, vec_stmt,
7020 				     true, NULL, NULL);
7021     }
7022 
7023   /* Create the destination vector.  */
7024   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7025 
7026   prev_stmt_info = NULL;
7027   prev_phi_info = NULL;
7028   if (!slp_node)
7029     {
7030       vec_oprnds0.create (1);
7031       vec_oprnds1.create (1);
7032       if (op_type == ternary_op)
7033         vec_oprnds2.create (1);
7034     }
7035 
7036   phis.create (vec_num);
7037   vect_defs.create (vec_num);
7038   if (!slp_node)
7039     vect_defs.quick_push (NULL_TREE);
7040 
7041   if (slp_node)
7042     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7043   else
7044     phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7045 
7046   for (j = 0; j < ncopies; j++)
7047     {
7048       if (code == COND_EXPR)
7049         {
7050           gcc_assert (!slp_node);
7051 	  vectorizable_condition (stmt_info, gsi, vec_stmt,
7052 				  true, NULL, NULL);
7053           break;
7054         }
7055       if (code == LSHIFT_EXPR
7056 	  || code == RSHIFT_EXPR)
7057 	{
7058 	  vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7059 	  break;
7060 	}
7061 
7062       /* Handle uses.  */
7063       if (j == 0)
7064         {
7065 	  if (slp_node)
7066 	    {
7067 	      /* Get vec defs for all the operands except the reduction index,
7068 		 ensuring the ordering of the ops in the vector is kept.  */
7069 	      auto_vec<tree, 3> slp_ops;
7070 	      auto_vec<vec<tree>, 3> vec_defs;
7071 
7072 	      slp_ops.quick_push (ops[0]);
7073 	      slp_ops.quick_push (ops[1]);
7074 	      if (op_type == ternary_op)
7075 		slp_ops.quick_push (ops[2]);
7076 
7077 	      vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7078 
7079 	      vec_oprnds0.safe_splice (vec_defs[0]);
7080 	      vec_defs[0].release ();
7081 	      vec_oprnds1.safe_splice (vec_defs[1]);
7082 	      vec_defs[1].release ();
7083 	      if (op_type == ternary_op)
7084 		{
7085 		  vec_oprnds2.safe_splice (vec_defs[2]);
7086 		  vec_defs[2].release ();
7087 		}
7088 	    }
7089           else
7090 	    {
7091               vec_oprnds0.quick_push
7092 		(vect_get_vec_def_for_operand (ops[0], stmt_info));
7093               vec_oprnds1.quick_push
7094 		(vect_get_vec_def_for_operand (ops[1], stmt_info));
7095               if (op_type == ternary_op)
7096 		vec_oprnds2.quick_push
7097 		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
7098 	    }
7099         }
7100       else
7101         {
7102           if (!slp_node)
7103             {
7104 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7105 
7106 	      if (single_defuse_cycle && reduc_index == 0)
7107 		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7108 	      else
7109 		vec_oprnds0[0]
7110 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7111 						    vec_oprnds0[0]);
7112 	      if (single_defuse_cycle && reduc_index == 1)
7113 		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7114 	      else
7115 		vec_oprnds1[0]
7116 		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7117 						    vec_oprnds1[0]);
7118 	      if (op_type == ternary_op)
7119 		{
7120 		  if (single_defuse_cycle && reduc_index == 2)
7121 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7122 		  else
7123 		    vec_oprnds2[0]
7124 		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7125 							vec_oprnds2[0]);
7126 		}
7127             }
7128         }
7129 
7130       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7131         {
7132 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7133 	  if (masked_loop_p)
7134 	    {
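	      /* In the fully-masked case the operation becomes a conditional
		 internal function call, roughly
		   lhs = COND_<op> (mask, acc, vop1, acc)
		 so that active lanes compute acc <op> vop1 while inactive
		 lanes simply pass the accumulator through (a descriptive
		 note; the call is built just below).  */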
7135 	      /* Make sure that the reduction accumulator is vop[0].  */
7136 	      if (reduc_index == 1)
7137 		{
7138 		  gcc_assert (commutative_tree_code (code));
7139 		  std::swap (vop[0], vop[1]);
7140 		}
7141 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7142 					      vectype_in, i * ncopies + j);
7143 	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7144 							vop[0], vop[1],
7145 							vop[0]);
7146 	      new_temp = make_ssa_name (vec_dest, call);
7147 	      gimple_call_set_lhs (call, new_temp);
7148 	      gimple_call_set_nothrow (call, true);
7149 	      new_stmt_info
7150 		= vect_finish_stmt_generation (stmt_info, call, gsi);
7151 	    }
7152 	  else
7153 	    {
7154 	      if (op_type == ternary_op)
7155 		vop[2] = vec_oprnds2[i];
7156 
7157 	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
7158 						       vop[0], vop[1], vop[2]);
7159 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7160 	      gimple_assign_set_lhs (new_stmt, new_temp);
7161 	      new_stmt_info
7162 		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7163 	    }
7164 
7165           if (slp_node)
7166             {
7167 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7168               vect_defs.quick_push (new_temp);
7169             }
7170           else
7171             vect_defs[0] = new_temp;
7172         }
7173 
7174       if (slp_node)
7175         continue;
7176 
7177       if (j == 0)
7178 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7179       else
7180 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7181 
7182       prev_stmt_info = new_stmt_info;
7183     }
7184 
7185   /* Finalize the reduction-phi (set its arguments) and create the
7186      epilog reduction code.  */
7187   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7188     vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7189 
7190   vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7191 				    epilog_copies, reduc_fn, phis,
7192 				    double_reduc, slp_node, slp_node_instance,
7193 				    cond_reduc_val, cond_reduc_op_code,
7194 				    neutral_op);
7195 
7196   return true;
7197 }
7198 
7199 /* Function vect_min_worthwhile_factor.
7200 
7201    For a loop where we could vectorize the operation indicated by CODE,
7202    return the minimum vectorization factor that makes it worthwhile
7203    to use generic vectors.  */
7204 static unsigned int
7205 vect_min_worthwhile_factor (enum tree_code code)
7206 {
7207   switch (code)
7208     {
7209     case PLUS_EXPR:
7210     case MINUS_EXPR:
7211     case NEGATE_EXPR:
7212       return 4;
7213 
7214     case BIT_AND_EXPR:
7215     case BIT_IOR_EXPR:
7216     case BIT_XOR_EXPR:
7217     case BIT_NOT_EXPR:
7218       return 2;
7219 
7220     default:
7221       return INT_MAX;
7222     }
7223 }
7224 
7225 /* Return true if VINFO indicates we are doing loop vectorization and if
7226    it is worth decomposing CODE operations into scalar operations for
7227    that loop's vectorization factor.  */
7228 
7229 bool
7230 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7231 {
7232   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7233   unsigned HOST_WIDE_INT value;
7234   return (loop_vinfo
7235 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7236 	  && value >= vect_min_worthwhile_factor (code));
7237 }
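
/* Informal usage note: e.g. a PLUS_EXPR is only considered worth emulating
   without SIMD support when the loop's vectorization factor is a
   compile-time constant of at least 4, per vect_min_worthwhile_factor
   above.  */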
7238 
7239 /* Function vectorizable_induction
7240 
7241    Check if STMT_INFO performs an induction computation that can be vectorized.
7242    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7243    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7244    Return true if STMT_INFO is vectorizable in this way.  */
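/* For instance (purely illustrative): in

        for (i = 0; i < n; i++)
          { a[i] = j; j += 3; }

   the PHI for j is an induction with step 3; with a vectorization factor
   of 4 it becomes a vector IV with initial value { j0, j0+3, j0+6, j0+9 }
   that is bumped by { 12, 12, 12, 12 } on each vector iteration (see also
   the transform comment further below).  */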
7245 
7246 bool
7247 vectorizable_induction (stmt_vec_info stmt_info,
7248 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7249 			stmt_vec_info *vec_stmt, slp_tree slp_node,
7250 			stmt_vector_for_cost *cost_vec)
7251 {
7252   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7253   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7254   unsigned ncopies;
7255   bool nested_in_vect_loop = false;
7256   struct loop *iv_loop;
7257   tree vec_def;
7258   edge pe = loop_preheader_edge (loop);
7259   basic_block new_bb;
7260   tree new_vec, vec_init, vec_step, t;
7261   tree new_name;
7262   gimple *new_stmt;
7263   gphi *induction_phi;
7264   tree induc_def, vec_dest;
7265   tree init_expr, step_expr;
7266   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7267   unsigned i;
7268   tree expr;
7269   gimple_seq stmts;
7270   imm_use_iterator imm_iter;
7271   use_operand_p use_p;
7272   gimple *exit_phi;
7273   edge latch_e;
7274   tree loop_arg;
7275   gimple_stmt_iterator si;
7276 
7277   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7278   if (!phi)
7279     return false;
7280 
7281   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7282     return false;
7283 
7284   /* Make sure it was recognized as induction computation.  */
7285   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7286     return false;
7287 
7288   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7289   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7290 
7291   if (slp_node)
7292     ncopies = 1;
7293   else
7294     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7295   gcc_assert (ncopies >= 1);
7296 
7297   /* FORNOW. These restrictions should be relaxed.  */
7298   if (nested_in_vect_loop_p (loop, stmt_info))
7299     {
7300       imm_use_iterator imm_iter;
7301       use_operand_p use_p;
7302       gimple *exit_phi;
7303       edge latch_e;
7304       tree loop_arg;
7305 
7306       if (ncopies > 1)
7307 	{
7308 	  if (dump_enabled_p ())
7309 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 			     "multiple types in nested loop.\n");
7311 	  return false;
7312 	}
7313 
7314       /* FORNOW: outer loop induction with SLP not supported.  */
7315       if (STMT_SLP_TYPE (stmt_info))
7316 	return false;
7317 
7318       exit_phi = NULL;
7319       latch_e = loop_latch_edge (loop->inner);
7320       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7321       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7322 	{
7323 	  gimple *use_stmt = USE_STMT (use_p);
7324 	  if (is_gimple_debug (use_stmt))
7325 	    continue;
7326 
7327 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7328 	    {
7329 	      exit_phi = use_stmt;
7330 	      break;
7331 	    }
7332 	}
7333       if (exit_phi)
7334 	{
7335 	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7336 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7337 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7338 	    {
7339 	      if (dump_enabled_p ())
7340 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7341 				 "inner-loop induction only used outside "
7342 				 "of the outer vectorized loop.\n");
7343 	      return false;
7344 	    }
7345 	}
7346 
7347       nested_in_vect_loop = true;
7348       iv_loop = loop->inner;
7349     }
7350   else
7351     iv_loop = loop;
7352   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7353 
7354   if (slp_node && !nunits.is_constant ())
7355     {
7356       /* The current SLP code creates the initial value element-by-element.  */
7357       if (dump_enabled_p ())
7358 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7359 			 "SLP induction not supported for variable-length"
7360 			 " vectors.\n");
7361       return false;
7362     }
7363 
7364   if (!vec_stmt) /* transformation not required.  */
7365     {
7366       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7367       DUMP_VECT_SCOPE ("vectorizable_induction");
7368       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7369       return true;
7370     }
7371 
7372   /* Transform.  */
7373 
7374   /* Compute a vector variable, initialized with the first VF values of
7375      the induction variable.  E.g., for an iv with IV_PHI='X' and
7376      evolution S, for a vector of 4 units, we want to compute:
7377      [X, X + S, X + 2*S, X + 3*S].  */
7378 
7379   if (dump_enabled_p ())
7380     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7381 
7382   latch_e = loop_latch_edge (iv_loop);
7383   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7384 
7385   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7386   gcc_assert (step_expr != NULL_TREE);
7387 
7388   pe = loop_preheader_edge (iv_loop);
7389   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7390 				     loop_preheader_edge (iv_loop));
7391 
7392   stmts = NULL;
7393   if (!nested_in_vect_loop)
7394     {
7395       /* Convert the initial value to the desired type.  */
7396       tree new_type = TREE_TYPE (vectype);
7397       init_expr = gimple_convert (&stmts, new_type, init_expr);
7398 
7399       /* If we are using the loop mask to "peel" for alignment then we need
7400 	 to adjust the start value here.  */
7401       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7402       if (skip_niters != NULL_TREE)
7403 	{
7404 	  if (FLOAT_TYPE_P (vectype))
7405 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7406 					skip_niters);
7407 	  else
7408 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7409 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7410 					 skip_niters, step_expr);
7411 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7412 				    init_expr, skip_step);
7413 	}
7414     }
7415 
7416   /* Convert the step to the desired type.  */
7417   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7418 
7419   if (stmts)
7420     {
7421       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7422       gcc_assert (!new_bb);
7423     }
7424 
7425   /* Find the first insertion point in the BB.  */
7426   basic_block bb = gimple_bb (phi);
7427   si = gsi_after_labels (bb);
7428 
7429   /* For SLP induction we have to generate several IVs as for example
7430      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7431      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7432      [VF*S, VF*S, VF*S, VF*S] for all.  */
7433   if (slp_node)
7434     {
7435       /* Enforced above.  */
7436       unsigned int const_nunits = nunits.to_constant ();
7437 
7438       /* Generate [VF*S, VF*S, ... ].  */
7439       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7440 	{
7441 	  expr = build_int_cst (integer_type_node, vf);
7442 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7443 	}
7444       else
7445 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7446       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7447 			      expr, step_expr);
7448       if (! CONSTANT_CLASS_P (new_name))
7449 	new_name = vect_init_vector (stmt_info, new_name,
7450 				     TREE_TYPE (step_expr), NULL);
7451       new_vec = build_vector_from_val (vectype, new_name);
7452       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7453 
7454       /* Now generate the IVs.  */
7455       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7456       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7457       unsigned elts = const_nunits * nvects;
7458       unsigned nivs = least_common_multiple (group_size,
7459 					     const_nunits) / const_nunits;
7460       gcc_assert (elts % group_size == 0);
7461       tree elt = init_expr;
7462       unsigned ivn;
7463       for (ivn = 0; ivn < nivs; ++ivn)
7464 	{
7465 	  tree_vector_builder elts (vectype, const_nunits, 1);
7466 	  stmts = NULL;
7467 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7468 	    {
7469 	      if (ivn*const_nunits + eltn >= group_size
7470 		  && (ivn * const_nunits + eltn) % group_size == 0)
7471 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7472 				    elt, step_expr);
7473 	      elts.quick_push (elt);
7474 	    }
7475 	  vec_init = gimple_build_vector (&stmts, &elts);
7476 	  if (stmts)
7477 	    {
7478 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7479 	      gcc_assert (!new_bb);
7480 	    }
7481 
7482 	  /* Create the induction-phi that defines the induction-operand.  */
7483 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7484 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7485 	  stmt_vec_info induction_phi_info
7486 	    = loop_vinfo->add_stmt (induction_phi);
7487 	  induc_def = PHI_RESULT (induction_phi);
7488 
7489 	  /* Create the iv update inside the loop.  */
7490 	  vec_def = make_ssa_name (vec_dest);
7491 	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7492 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7493 	  loop_vinfo->add_stmt (new_stmt);
7494 
7495 	  /* Set the arguments of the phi node:  */
7496 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7497 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7498 		       UNKNOWN_LOCATION);
7499 
7500 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7501 	}
7502 
7503       /* Re-use IVs when we can.  */
7504       if (ivn < nvects)
7505 	{
7506 	  unsigned vfp
7507 	    = least_common_multiple (group_size, const_nunits) / group_size;
7508 	  /* Generate [VF'*S, VF'*S, ... ].  */
7509 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7510 	    {
7511 	      expr = build_int_cst (integer_type_node, vfp);
7512 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7513 	    }
7514 	  else
7515 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7516 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7517 				  expr, step_expr);
7518 	  if (! CONSTANT_CLASS_P (new_name))
7519 	    new_name = vect_init_vector (stmt_info, new_name,
7520 					 TREE_TYPE (step_expr), NULL);
7521 	  new_vec = build_vector_from_val (vectype, new_name);
7522 	  vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7523 	  for (; ivn < nvects; ++ivn)
7524 	    {
7525 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7526 	      tree def;
7527 	      if (gimple_code (iv) == GIMPLE_PHI)
7528 		def = gimple_phi_result (iv);
7529 	      else
7530 		def = gimple_assign_lhs (iv);
7531 	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
7532 					      PLUS_EXPR,
7533 					      def, vec_step);
7534 	      if (gimple_code (iv) == GIMPLE_PHI)
7535 		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7536 	      else
7537 		{
7538 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7539 		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7540 		}
7541 	      SLP_TREE_VEC_STMTS (slp_node).quick_push
7542 		(loop_vinfo->add_stmt (new_stmt));
7543 	    }
7544 	}
7545 
7546       return true;
7547     }
7548 
7549   /* Create the vector that holds the initial_value of the induction.  */
7550   if (nested_in_vect_loop)
7551     {
7552       /* iv_loop is nested in the loop to be vectorized.  init_expr has
7553 	 already been created during vectorization of previous stmts.  We
7554 	 obtain it from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7555       vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7556       /* If the initial value is not of proper type, convert it.  */
7557       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7558 	{
7559 	  new_stmt
7560 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7561 							  vect_simple_var,
7562 							  "vec_iv_"),
7563 				   VIEW_CONVERT_EXPR,
7564 				   build1 (VIEW_CONVERT_EXPR, vectype,
7565 					   vec_init));
7566 	  vec_init = gimple_assign_lhs (new_stmt);
7567 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7568 						 new_stmt);
7569 	  gcc_assert (!new_bb);
7570 	  loop_vinfo->add_stmt (new_stmt);
7571 	}
7572     }
7573   else
7574     {
7575       /* iv_loop is the loop to be vectorized. Create:
7576 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7577       stmts = NULL;
7578       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7579 
7580       unsigned HOST_WIDE_INT const_nunits;
7581       if (nunits.is_constant (&const_nunits))
7582 	{
7583 	  tree_vector_builder elts (vectype, const_nunits, 1);
7584 	  elts.quick_push (new_name);
7585 	  for (i = 1; i < const_nunits; i++)
7586 	    {
7587 	      /* Create: new_name_i = new_name + step_expr  */
7588 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7589 				       new_name, step_expr);
7590 	      elts.quick_push (new_name);
7591 	    }
7592 	  /* Create a vector from [new_name_0, new_name_1, ...,
7593 	     new_name_nunits-1]  */
7594 	  vec_init = gimple_build_vector (&stmts, &elts);
7595 	}
7596       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7597 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7598 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7599 				 new_name, step_expr);
7600       else
7601 	{
7602 	  /* Build:
7603 	        [base, base, base, ...]
7604 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7605 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7606 	  gcc_assert (flag_associative_math);
7607 	  tree index = build_index_vector (vectype, 0, 1);
7608 	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7609 							new_name);
7610 	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7611 							step_expr);
7612 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7613 	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7614 				   vec_init, step_vec);
7615 	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7616 				   vec_init, base_vec);
7617 	}
7618 
7619       if (stmts)
7620 	{
7621 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7622 	  gcc_assert (!new_bb);
7623 	}
7624     }
7625 
7626 
7627   /* Create the vector that holds the step of the induction.  */
7628   if (nested_in_vect_loop)
7629     /* iv_loop is nested in the loop to be vectorized. Generate:
7630        vec_step = [S, S, S, S]  */
7631     new_name = step_expr;
7632   else
7633     {
7634       /* iv_loop is the loop to be vectorized. Generate:
7635 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7636       gimple_seq seq = NULL;
7637       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7638 	{
7639 	  expr = build_int_cst (integer_type_node, vf);
7640 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7641 	}
7642       else
7643 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7644       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7645 			       expr, step_expr);
7646       if (seq)
7647 	{
7648 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7649 	  gcc_assert (!new_bb);
7650 	}
7651     }
7652 
7653   t = unshare_expr (new_name);
7654   gcc_assert (CONSTANT_CLASS_P (new_name)
7655 	      || TREE_CODE (new_name) == SSA_NAME);
7656   new_vec = build_vector_from_val (vectype, t);
7657   vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7658 
7659 
7660   /* Create the following def-use cycle:
7661      loop prolog:
7662          vec_init = ...
7663 	 vec_step = ...
7664      loop:
7665          vec_iv = PHI <vec_init, vec_loop>
7666          ...
7667          STMT
7668          ...
7669          vec_loop = vec_iv + vec_step;  */
7670 
7671   /* Create the induction-phi that defines the induction-operand.  */
7672   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7673   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7674   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7675   induc_def = PHI_RESULT (induction_phi);
7676 
7677   /* Create the iv update inside the loop.  */
7678   vec_def = make_ssa_name (vec_dest);
7679   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7680   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7681   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7682 
7683   /* Set the arguments of the phi node:  */
7684   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7685   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7686 	       UNKNOWN_LOCATION);
7687 
7688   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7689 
7690   /* In case the vectorization factor (VF) is bigger than the number
7691      of elements that we can fit in a vectype (nunits), we have to generate
7692      more than one vector stmt - i.e., we need to "unroll" the
7693      vector stmt by a factor of VF/nunits.  For more details see the
7694      documentation in vectorizable_operation.  */
7695 
7696   if (ncopies > 1)
7697     {
7698       gimple_seq seq = NULL;
7699       stmt_vec_info prev_stmt_vinfo;
7700       /* FORNOW. This restriction should be relaxed.  */
7701       gcc_assert (!nested_in_vect_loop);
7702 
7703       /* Create the vector that holds the step of the induction.  */
7704       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7705 	{
7706 	  expr = build_int_cst (integer_type_node, nunits);
7707 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7708 	}
7709       else
7710 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7711       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7712 			       expr, step_expr);
7713       if (seq)
7714 	{
7715 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7716 	  gcc_assert (!new_bb);
7717 	}
7718 
7719       t = unshare_expr (new_name);
7720       gcc_assert (CONSTANT_CLASS_P (new_name)
7721 		  || TREE_CODE (new_name) == SSA_NAME);
7722       new_vec = build_vector_from_val (vectype, t);
7723       vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7724 
7725       vec_def = induc_def;
7726       prev_stmt_vinfo = induction_phi_info;
7727       for (i = 1; i < ncopies; i++)
7728 	{
7729 	  /* vec_i = vec_prev + vec_step  */
7730 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7731 					  vec_def, vec_step);
7732 	  vec_def = make_ssa_name (vec_dest, new_stmt);
7733 	  gimple_assign_set_lhs (new_stmt, vec_def);
7734 
7735 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7736 	  new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7737 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7738 	  prev_stmt_vinfo = new_stmt_info;
7739 	}
7740     }
7741 
7742   if (nested_in_vect_loop)
7743     {
7744       /* Find the loop-closed exit-phi of the induction, and record
7745          the final vector of induction results:  */
7746       exit_phi = NULL;
7747       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7748         {
7749 	  gimple *use_stmt = USE_STMT (use_p);
7750 	  if (is_gimple_debug (use_stmt))
7751 	    continue;
7752 
7753 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7754 	    {
7755 	      exit_phi = use_stmt;
7756 	      break;
7757 	    }
7758         }
7759       if (exit_phi)
7760 	{
7761 	  stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7762 	  /* FORNOW.  We do not yet support an inner-loop induction that is
7763 	     used only outside the outer loop (i.e. not in the outer loop).  */
7764 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7765 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
7766 
7767 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7768 	  if (dump_enabled_p ())
7769 	    dump_printf_loc (MSG_NOTE, vect_location,
7770 			     "vector of inductions after inner-loop:%G",
7771 			     new_stmt);
7772 	}
7773     }
7774 
7775 
7776   if (dump_enabled_p ())
7777     dump_printf_loc (MSG_NOTE, vect_location,
7778 		     "transform induction: created def-use cycle: %G%G",
7779 		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
7780 
7781   return true;
7782 }
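
/* Illustrative sketch (not part of the pass): for a non-nested, non-SLP
   induction with initial value X, step S and a 4-lane vector type, the
   transform above conceptually rewrites

	j = X;
	for (i = 0; i < n; i++)
	  { a[i] = j; j += S; }

   into

	vec_init = { X, X + S, X + 2*S, X + 3*S };   // in the preheader
	vec_step = { 4*S, 4*S, 4*S, 4*S };           // VF * S
	for (i = 0; i < n/4; i++)
	  { va[i] = vec_iv; vec_iv += vec_step; }

   where vec_iv is the new PHI <vec_init (preheader), vec_iv + vec_step
   (latch)> and the store shown is assumed to be vectorized separately.  */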
7783 
7784 /* Function vectorizable_live_operation.
7785 
7786    STMT_INFO computes a value that is used outside the loop.  Check if
7787    it can be supported.  */
7788 
7789 bool
7790 vectorizable_live_operation (stmt_vec_info stmt_info,
7791 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7792 			     slp_tree slp_node, int slp_index,
7793 			     stmt_vec_info *vec_stmt,
7794 			     stmt_vector_for_cost *)
7795 {
7796   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7797   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7798   imm_use_iterator imm_iter;
7799   tree lhs, lhs_type, bitsize, vec_bitsize;
7800   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7801   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7802   int ncopies;
7803   gimple *use_stmt;
7804   auto_vec<tree> vec_oprnds;
7805   int vec_entry = 0;
7806   poly_uint64 vec_index = 0;
7807 
7808   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7809 
7810   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7811     return false;
7812 
7813   /* FORNOW.  CHECKME.  */
7814   if (nested_in_vect_loop_p (loop, stmt_info))
7815     return false;
7816 
7817   /* If STMT is not relevant and it is a simple assignment and its inputs are
7818      invariant then it can remain in place, unvectorized.  The original last
7819      scalar value that it computes will be used.  */
7820   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7821     {
7822       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7823       if (dump_enabled_p ())
7824 	dump_printf_loc (MSG_NOTE, vect_location,
7825 			 "statement is simple and uses invariant.  Leaving in "
7826 			 "place.\n");
7827       return true;
7828     }
7829 
7830   if (slp_node)
7831     ncopies = 1;
7832   else
7833     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7834 
7835   if (slp_node)
7836     {
7837       gcc_assert (slp_index >= 0);
7838 
7839       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7841 
7842       /* Get the last occurrence of the scalar index from the concatenation of
7843 	 all the slp vectors. Calculate which slp vector it is and the index
7844 	 within.  */
7845       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7846 
7847       /* Calculate which vector contains the result, and which lane of
7848 	 that vector we need.  */
7849       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7850 	{
7851 	  if (dump_enabled_p ())
7852 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7853 			     "Cannot determine which vector holds the"
7854 			     " final result.\n");
7855 	  return false;
7856 	}
7857     }
7858 
7859   if (!vec_stmt)
7860     {
7861       /* No transformation required.  */
7862       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7863 	{
7864 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7865 					       OPTIMIZE_FOR_SPEED))
7866 	    {
7867 	      if (dump_enabled_p ())
7868 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7869 				 "can't use a fully-masked loop because "
7870 				 "the target doesn't support extract last "
7871 				 "reduction.\n");
7872 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7873 	    }
7874 	  else if (slp_node)
7875 	    {
7876 	      if (dump_enabled_p ())
7877 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 				 "can't use a fully-masked loop because an "
7879 				 "SLP statement is live after the loop.\n");
7880 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7881 	    }
7882 	  else if (ncopies > 1)
7883 	    {
7884 	      if (dump_enabled_p ())
7885 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 				 "can't use a fully-masked loop because"
7887 				 " ncopies is greater than 1.\n");
7888 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7889 	    }
7890 	  else
7891 	    {
7892 	      gcc_assert (ncopies == 1 && !slp_node);
7893 	      vect_record_loop_mask (loop_vinfo,
7894 				     &LOOP_VINFO_MASKS (loop_vinfo),
7895 				     1, vectype);
7896 	    }
7897 	}
7898       return true;
7899     }
7900 
7901   /* Use the lhs of the original scalar statement.  */
7902   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7903 
7904   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7905 	: gimple_get_lhs (stmt);
7906   lhs_type = TREE_TYPE (lhs);
7907 
7908   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7909 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7910 	     : TYPE_SIZE (TREE_TYPE (vectype)));
7911   vec_bitsize = TYPE_SIZE (vectype);
7912 
7913   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7914   tree vec_lhs, bitstart;
7915   if (slp_node)
7916     {
7917       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7918 
7919       /* Get the correct slp vectorized stmt.  */
7920       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7921       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7922 	vec_lhs = gimple_phi_result (phi);
7923       else
7924 	vec_lhs = gimple_get_lhs (vec_stmt);
7925 
7926       /* Get entry to use.  */
7927       bitstart = bitsize_int (vec_index);
7928       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7929     }
7930   else
7931     {
7932       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7933       vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7934       gcc_checking_assert (ncopies == 1
7935 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7936 
7937       /* For multiple copies, get the last copy.  */
7938       for (int i = 1; i < ncopies; ++i)
7939 	vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7940 
7941       /* Get the last lane in the vector.  */
7942       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7943     }
7944 
7945   gimple_seq stmts = NULL;
7946   tree new_tree;
7947   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7948     {
7949       /* Emit:
7950 
7951 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7952 
7953 	 where VEC_LHS is the vectorized live-out result and MASK is
7954 	 the loop mask for the final iteration.  */
7955       gcc_assert (ncopies == 1 && !slp_node);
7956       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7957       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7958 				      1, vectype, 0);
7959       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7960 				      scalar_type, mask, vec_lhs);
7961 
7962       /* Convert the extracted vector element to the required scalar type.  */
7963       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7964     }
7965   else
7966     {
7967       tree bftype = TREE_TYPE (vectype);
7968       if (VECTOR_BOOLEAN_TYPE_P (vectype))
7969 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7970       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7971       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7972 				       &stmts, true, NULL_TREE);
7973     }
7974 
7975   if (stmts)
7976     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7977 
7978   /* Replace uses of lhs with the newly computed result.  If the use stmt
7979      is a single-argument PHI, just replace all uses of the PHI result; this
7980      is needed because the LC SSA PHI defining lhs may precede the new stmt.  */
7981   use_operand_p use_p;
7982   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7983     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7984 	&& !is_gimple_debug (use_stmt))
7985     {
7986       if (gimple_code (use_stmt) == GIMPLE_PHI
7987 	  && gimple_phi_num_args (use_stmt) == 1)
7988 	{
7989 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7990 	}
7991       else
7992 	{
7993 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7994 	    SET_USE (use_p, new_tree);
7995 	}
7996       update_stmt (use_stmt);
7997     }
7998 
7999   return true;
8000 }
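
/* Illustrative sketch (not part of the pass): for

	for (i = 0; i < n; i++)
	  last = b[i];
	use (last);

   with a 4 x 32-bit vector type, the live value is taken from the final
   lane of the last vector copy on the loop exit edge, conceptually

	last' = BIT_FIELD_REF <vec_b, 32, 96>;    // bits 96..127, lane 3

   or, in a fully-masked loop,

	last' = EXTRACT_LAST <loop_mask, vec_b>;  // last active lane

   and all out-of-loop uses of LAST are redirected to last'.  */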
8001 
8002 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8003 
8004 static void
8005 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8006 {
8007   ssa_op_iter op_iter;
8008   imm_use_iterator imm_iter;
8009   def_operand_p def_p;
8010   gimple *ustmt;
8011 
8012   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8013     {
8014       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8015 	{
8016 	  basic_block bb;
8017 
8018 	  if (!is_gimple_debug (ustmt))
8019 	    continue;
8020 
8021 	  bb = gimple_bb (ustmt);
8022 
8023 	  if (!flow_bb_inside_loop_p (loop, bb))
8024 	    {
8025 	      if (gimple_debug_bind_p (ustmt))
8026 		{
8027 		  if (dump_enabled_p ())
8028 		    dump_printf_loc (MSG_NOTE, vect_location,
8029                                      "killing debug use\n");
8030 
8031 		  gimple_debug_bind_reset_value (ustmt);
8032 		  update_stmt (ustmt);
8033 		}
8034 	      else
8035 		gcc_unreachable ();
8036 	    }
8037 	}
8038     }
8039 }
8040 
8041 /* Given the loop represented by LOOP_VINFO, return true if the computation
8042    of LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8043    otherwise.  */
8044 
8045 static bool
8046 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8047 {
8048   /* Constant case.  */
8049   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8050     {
8051       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8052       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8053 
8054       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8055       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8056       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8057 	return true;
8058     }
8059 
8060   widest_int max;
8061   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8062   /* Check the upper bound of loop niters.  */
8063   if (get_max_loop_iterations (loop, &max))
8064     {
8065       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8066       signop sgn = TYPE_SIGN (type);
8067       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8068       if (max < type_max)
8069 	return true;
8070     }
8071   return false;
8072 }
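
/* Illustrative sketch (not part of the pass): if LOOP_VINFO_NITERS has an
   8-bit unsigned type and the latch count (NITERSM1) can reach 255, then
   NITERS = NITERSM1 + 1 wraps to 0.  The function therefore returns true
   only when either the constant latch count is known not to wrap, or the
   recorded upper bound on latch executions (say 200) is strictly below
   the type's maximum (255); otherwise it conservatively returns false.  */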
8073 
8074 /* Return a mask type with half the number of elements as TYPE.  */
8075 
8076 tree
8077 vect_halve_mask_nunits (tree type)
8078 {
8079   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8080   return build_truth_vector_type (nunits, current_vector_size);
8081 }
8082 
8083 /* Return a mask type with twice as many elements as TYPE.  */
8084 
8085 tree
8086 vect_double_mask_nunits (tree type)
8087 {
8088   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8089   return build_truth_vector_type (nunits, current_vector_size);
8090 }
8091 
8092 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8093    contain a sequence of NVECTORS masks that each control a vector of type
8094    VECTYPE.  */
8095 
8096 void
8097 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8098 		       unsigned int nvectors, tree vectype)
8099 {
8100   gcc_assert (nvectors != 0);
8101   if (masks->length () < nvectors)
8102     masks->safe_grow_cleared (nvectors);
8103   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8104   /* The number of scalars per iteration and the number of vectors are
8105      both compile-time constants.  */
8106   unsigned int nscalars_per_iter
8107     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8108 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8109   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8110     {
8111       rgm->max_nscalars_per_iter = nscalars_per_iter;
8112       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8113     }
8114 }
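
/* Illustrative sketch (not part of the pass): in a loop with VF = 16 that
   accesses both QImode and HImode data, the QI statements need one V16QI
   vector per iteration and the HI statements two V8HI vectors, so the
   callers record

	vect_record_loop_mask (loop_vinfo, masks, 1, V16QI_vectype);
	vect_record_loop_mask (loop_vinfo, masks, 2, V8HI_vectype);

   which populates two separate rgroups, (*masks)[0] and (*masks)[1], each
   with max_nscalars_per_iter == 1 and a mask type with the same number of
   elements as its vector type.  */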
8115 
8116 /* Given a complete set of masks MASKS, extract mask number INDEX
8117    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8118    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8119 
8120    See the comment above vec_loop_masks for more details about the mask
8121    arrangement.  */
8122 
8123 tree
8124 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8125 		    unsigned int nvectors, tree vectype, unsigned int index)
8126 {
8127   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8128   tree mask_type = rgm->mask_type;
8129 
8130   /* Populate the rgroup's mask array, if this is the first time we've
8131      used it.  */
8132   if (rgm->masks.is_empty ())
8133     {
8134       rgm->masks.safe_grow_cleared (nvectors);
8135       for (unsigned int i = 0; i < nvectors; ++i)
8136 	{
8137 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8138 	  /* Provide a dummy definition until the real one is available.  */
8139 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8140 	  rgm->masks[i] = mask;
8141 	}
8142     }
8143 
8144   tree mask = rgm->masks[index];
8145   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8146 		TYPE_VECTOR_SUBPARTS (vectype)))
8147     {
8148       /* A loop mask for data type X can be reused for data type Y
8149 	 if X has N times more elements than Y and if Y's elements
8150 	 are N times bigger than X's.  In this case each sequence
8151 	 of N elements in the loop mask will be all-zero or all-one.
8152 	 We can then view-convert the mask so that each sequence of
8153 	 N elements is replaced by a single element.  */
8154       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8155 			      TYPE_VECTOR_SUBPARTS (vectype)));
8156       gimple_seq seq = NULL;
8157       mask_type = build_same_sized_truth_vector_type (vectype);
8158       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8159       if (seq)
8160 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8161     }
8162   return mask;
8163 }
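
/* Illustrative sketch (not part of the pass) of the reuse case above:
   with VF = 8, a grouped uint8_t access that loads two elements per
   scalar iteration (V16QI, two scalars per iteration) and a uint16_t
   access (V8HI, one scalar per iteration) both need one vector per
   iteration and so share rgroup (*masks)[0], whose mask type has 16
   elements.  When the V8HI statement asks for its mask, each pair of
   adjacent mask elements is known to be all-zero or all-one, so the
   16-element mask is simply VIEW_CONVERTed to an 8-element mask with
   double-sized elements.  */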
8164 
8165 /* Scale profiling counters by estimation for LOOP which is vectorized
8166    by factor VF.  */
8167 
8168 static void
8169 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8170 {
8171   edge preheader = loop_preheader_edge (loop);
8172   /* Reduce loop iterations by the vectorization factor.  */
8173   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8174   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8175 
8176   if (freq_h.nonzero_p ())
8177     {
8178       profile_probability p;
8179 
8180       /* Avoid dropping loop body profile counter to 0 because of zero count
8181 	 in loop's preheader.  */
8182       if (!(freq_e == profile_count::zero ()))
8183         freq_e = freq_e.force_nonzero ();
8184       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8185       scale_loop_frequencies (loop, p);
8186     }
8187 
8188   edge exit_e = single_exit (loop);
8189   exit_e->probability = profile_probability::always ()
8190 				 .apply_scale (1, new_est_niter + 1);
8191 
8192   edge exit_l = single_pred_edge (loop->latch);
8193   profile_probability prob = exit_l->probability;
8194   exit_l->probability = exit_e->probability.invert ();
8195   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8196     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8197 }
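
/* Illustrative sketch (not part of the pass): if the profile estimated
   about 100 iterations per entry into the loop and VF = 4, the unrolled
   estimate becomes roughly new_est_niter = 25, the header count is scaled
   so that it is about (25 + 1) times the preheader count, and the exit
   edge gets probability 1 / (25 + 1).  */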
8198 
8199 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8200    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8201    stmt_vec_info.  */
8202 
8203 static void
8204 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8205 			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8206 {
8207   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8208   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8209 
8210   if (dump_enabled_p ())
8211     dump_printf_loc (MSG_NOTE, vect_location,
8212 		     "------>vectorizing statement: %G", stmt_info->stmt);
8213 
8214   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8215     vect_loop_kill_debug_uses (loop, stmt_info);
8216 
8217   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8218       && !STMT_VINFO_LIVE_P (stmt_info))
8219     return;
8220 
8221   if (STMT_VINFO_VECTYPE (stmt_info))
8222     {
8223       poly_uint64 nunits
8224 	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8225       if (!STMT_SLP_TYPE (stmt_info)
8226 	  && maybe_ne (nunits, vf)
8227 	  && dump_enabled_p ())
8228 	/* For SLP, VF is set according to the unrolling factor, not the
8229 	   vector size, so this diagnostic is not meaningful for SLP.  */
8230 	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8231     }
8232 
8233   /* Pure SLP statements have already been vectorized.  We still need
8234      to apply loop vectorization to hybrid SLP statements.  */
8235   if (PURE_SLP_STMT (stmt_info))
8236     return;
8237 
8238   if (dump_enabled_p ())
8239     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8240 
8241   if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8242     *seen_store = stmt_info;
8243 }
8244 
8245 /* Function vect_transform_loop.
8246 
8247    The analysis phase has determined that the loop is vectorizable.
8248    Vectorize the loop - create vectorized stmts to replace the scalar
8249    stmts in the loop, and update the loop exit condition.
8250    Return the scalar epilogue loop, if any.  */
8251 
8252 struct loop *
8253 vect_transform_loop (loop_vec_info loop_vinfo)
8254 {
8255   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8256   struct loop *epilogue = NULL;
8257   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8258   int nbbs = loop->num_nodes;
8259   int i;
8260   tree niters_vector = NULL_TREE;
8261   tree step_vector = NULL_TREE;
8262   tree niters_vector_mult_vf = NULL_TREE;
8263   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8264   unsigned int lowest_vf = constant_lower_bound (vf);
8265   gimple *stmt;
8266   bool check_profitability = false;
8267   unsigned int th;
8268 
8269   DUMP_VECT_SCOPE ("vec_transform_loop");
8270 
8271   loop_vinfo->shared->check_datarefs ();
8272 
8273   /* Use the more conservative vectorization threshold.  If the number
8274      of iterations is constant, assume the cost check has been performed
8275      by our caller.  If the threshold makes all loops profitable that
8276      run at least the (estimated) vectorization factor number of times,
8277      checking is pointless too.  */
8278   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8279   if (th >= vect_vf_for_cost (loop_vinfo)
8280       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8281     {
8282       if (dump_enabled_p ())
8283 	dump_printf_loc (MSG_NOTE, vect_location,
8284 			 "Profitability threshold is %d loop iterations.\n",
8285                          th);
8286       check_profitability = true;
8287     }
8288 
8289   /* Make sure there exists a single-predecessor exit bb.  Do this before
8290      versioning.   */
8291   edge e = single_exit (loop);
8292   if (! single_pred_p (e->dest))
8293     {
8294       split_loop_exit_edge (e, true);
8295       if (dump_enabled_p ())
8296 	dump_printf (MSG_NOTE, "split exit edge\n");
8297     }
8298 
8299   /* Version the loop first, if required, so the profitability check
8300      comes first.  */
8301 
8302   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8303     {
8304       poly_uint64 versioning_threshold
8305 	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8306       if (check_profitability
8307 	  && ordered_p (poly_uint64 (th), versioning_threshold))
8308 	{
8309 	  versioning_threshold = ordered_max (poly_uint64 (th),
8310 					      versioning_threshold);
8311 	  check_profitability = false;
8312 	}
8313       struct loop *sloop
8314 	= vect_loop_versioning (loop_vinfo, th, check_profitability,
8315 				versioning_threshold);
8316       sloop->force_vectorize = false;
8317       check_profitability = false;
8318     }
8319 
8320   /* Make sure there exists a single-predecessor exit bb also on the
8321      scalar loop copy.  Do this after versioning but before peeling,
8322      so that the CFG structure is fine for both the scalar and the
8323      if-converted loop and slpeel_duplicate_current_defs_from_edges
8324      sees matching loop-closed PHI nodes on the exit.  */
8325   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8326     {
8327       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8328       if (! single_pred_p (e->dest))
8329 	{
8330 	  split_loop_exit_edge (e, true);
8331 	  if (dump_enabled_p ())
8332 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8333 	}
8334     }
8335 
8336   tree niters = vect_build_loop_niters (loop_vinfo);
8337   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8338   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8339   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8340   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8341 			      &step_vector, &niters_vector_mult_vf, th,
8342 			      check_profitability, niters_no_overflow);
8343 
8344   if (niters_vector == NULL_TREE)
8345     {
8346       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8347 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8348 	  && known_eq (lowest_vf, vf))
8349 	{
8350 	  niters_vector
8351 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8352 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8353 	  step_vector = build_one_cst (TREE_TYPE (niters));
8354 	}
8355       else
8356 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8357 				     &step_vector, niters_no_overflow);
8358     }
8359 
8360   /* 1) Make sure the loop header has exactly two entries
8361      2) Make sure we have a preheader basic block.  */
8362 
8363   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8364 
8365   split_edge (loop_preheader_edge (loop));
8366 
8367   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8368       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8369     /* This will deal with any possible peeling.  */
8370     vect_prepare_for_masked_peels (loop_vinfo);
8371 
8372   /* Schedule the SLP instances first, then handle loop vectorization
8373      below.  */
8374   if (!loop_vinfo->slp_instances.is_empty ())
8375     {
8376       DUMP_VECT_SCOPE ("scheduling SLP instances");
8377       vect_schedule_slp (loop_vinfo);
8378     }
8379 
8380   /* FORNOW: the vectorizer supports only loops whose body consists
8381      of one basic block (header + empty latch).  When the vectorizer
8382      supports more involved loop forms, the order in which the BBs are
8383      traversed will need to be reconsidered.  */
8384 
8385   for (i = 0; i < nbbs; i++)
8386     {
8387       basic_block bb = bbs[i];
8388       stmt_vec_info stmt_info;
8389 
8390       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8391 	   gsi_next (&si))
8392         {
8393 	  gphi *phi = si.phi ();
8394 	  if (dump_enabled_p ())
8395 	    dump_printf_loc (MSG_NOTE, vect_location,
8396 			     "------>vectorizing phi: %G", phi);
8397 	  stmt_info = loop_vinfo->lookup_stmt (phi);
8398 	  if (!stmt_info)
8399 	    continue;
8400 
8401 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8402 	    vect_loop_kill_debug_uses (loop, stmt_info);
8403 
8404 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8405 	      && !STMT_VINFO_LIVE_P (stmt_info))
8406 	    continue;
8407 
8408 	  if (STMT_VINFO_VECTYPE (stmt_info)
8409 	      && (maybe_ne
8410 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8411 	      && dump_enabled_p ())
8412 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8413 
8414 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8415 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8416 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8417 	      && ! PURE_SLP_STMT (stmt_info))
8418 	    {
8419 	      if (dump_enabled_p ())
8420 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8421 	      vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8422 	    }
8423 	}
8424 
8425       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8426 	   !gsi_end_p (si);)
8427 	{
8428 	  stmt = gsi_stmt (si);
8429 	  /* During vectorization remove existing clobber stmts.  */
8430 	  if (gimple_clobber_p (stmt))
8431 	    {
8432 	      unlink_stmt_vdef (stmt);
8433 	      gsi_remove (&si, true);
8434 	      release_defs (stmt);
8435 	    }
8436 	  else
8437 	    {
8438 	      stmt_info = loop_vinfo->lookup_stmt (stmt);
8439 
8440 	      /* vector stmts created in the outer-loop during vectorization of
8441 		 stmts in an inner-loop may not have a stmt_info, and do not
8442 		 need to be vectorized.  */
8443 	      stmt_vec_info seen_store = NULL;
8444 	      if (stmt_info)
8445 		{
8446 		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8447 		    {
8448 		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8449 		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8450 			   !gsi_end_p (subsi); gsi_next (&subsi))
8451 			{
8452 			  stmt_vec_info pat_stmt_info
8453 			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8454 			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8455 						    &si, &seen_store);
8456 			}
8457 		      stmt_vec_info pat_stmt_info
8458 			= STMT_VINFO_RELATED_STMT (stmt_info);
8459 		      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8460 						&seen_store);
8461 		    }
8462 		  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8463 					    &seen_store);
8464 		}
8465 	      gsi_next (&si);
8466 	      if (seen_store)
8467 		{
8468 		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8469 		    /* Interleaving.  The vectorization of the
8470 		       interleaving chain was completed - free all
8471 		       the stores in the chain.  */
8472 		    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8473 		  else
8474 		    /* Free the attached stmt_vec_info and remove the stmt.  */
8475 		    loop_vinfo->remove_stmt (stmt_info);
8476 		}
8477 	    }
8478 	}
8479 
8480       /* Stub out scalar statements that must not survive vectorization.
8481 	 Doing this here helps with grouped statements, or statements that
8482 	 are involved in patterns.  */
8483       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8484 	   !gsi_end_p (gsi); gsi_next (&gsi))
8485 	{
8486 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8487 	  if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8488 	    {
8489 	      tree lhs = gimple_get_lhs (call);
8490 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8491 		{
8492 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8493 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8494 		  gsi_replace (&gsi, new_stmt, true);
8495 		}
8496 	    }
8497 	}
8498     }				/* BBs in loop */
8499 
8500   /* The vectorization factor is always > 1, so if we use an IV increment
8501      of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8502   if (integer_onep (step_vector))
8503     niters_no_overflow = true;
8504   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8505 			   niters_vector_mult_vf, !niters_no_overflow);
8506 
8507   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8508   scale_profile_for_vect_loop (loop, assumed_vf);
8509 
8510   /* True if the final iteration might not handle a full vector's
8511      worth of scalar iterations.  */
8512   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8513   /* The minimum number of iterations performed by the epilogue.  This
8514      is 1 when peeling for gaps because we always need a final scalar
8515      iteration.  */
8516   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8517   /* +1 to convert latch counts to loop iteration counts,
8518      -min_epilogue_iters to remove iterations that cannot be performed
8519        by the vector code.  */
8520   int bias_for_lowest = 1 - min_epilogue_iters;
8521   int bias_for_assumed = bias_for_lowest;
8522   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8523   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8524     {
8525       /* When the amount of peeling is known at compile time, the first
8526 	 iteration will have exactly alignment_npeels active elements.
8527 	 In the worst case it will have at least one.  */
8528       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8529       bias_for_lowest += lowest_vf - min_first_active;
8530       bias_for_assumed += assumed_vf - min_first_active;
8531     }
8532   /* In these calculations the "- 1" converts loop iteration counts
8533      back to latch counts.  */
8534   if (loop->any_upper_bound)
8535     loop->nb_iterations_upper_bound
8536       = (final_iter_may_be_partial
8537 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8538 			  lowest_vf) - 1
8539 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8540 			   lowest_vf) - 1);
8541   if (loop->any_likely_upper_bound)
8542     loop->nb_iterations_likely_upper_bound
8543       = (final_iter_may_be_partial
8544 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8545 			  + bias_for_lowest, lowest_vf) - 1
8546 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8547 			   + bias_for_lowest, lowest_vf) - 1);
8548   if (loop->any_estimate)
8549     loop->nb_iterations_estimate
8550       = (final_iter_may_be_partial
8551 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8552 			  assumed_vf) - 1
8553 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8554 			   assumed_vf) - 1);
8555 
8556   if (dump_enabled_p ())
8557     {
8558       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8559 	{
8560 	  dump_printf_loc (MSG_NOTE, vect_location,
8561 			   "LOOP VECTORIZED\n");
8562 	  if (loop->inner)
8563 	    dump_printf_loc (MSG_NOTE, vect_location,
8564 			     "OUTER LOOP VECTORIZED\n");
8565 	  dump_printf (MSG_NOTE, "\n");
8566 	}
8567       else
8568 	{
8569 	  dump_printf_loc (MSG_NOTE, vect_location,
8570 			   "LOOP EPILOGUE VECTORIZED (VS=");
8571 	  dump_dec (MSG_NOTE, current_vector_size);
8572 	  dump_printf (MSG_NOTE, ")\n");
8573 	}
8574     }
8575 
8576   /* Loops vectorized with a variable factor won't benefit from
8577      unrolling/peeling.  */
8578   if (!vf.is_constant ())
8579     {
8580       loop->unroll = 1;
8581       if (dump_enabled_p ())
8582 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8583 			 " variable-length vectorization factor\n");
8584     }
8585   /* Free SLP instances here because otherwise stmt reference counting
8586      won't work.  */
8587   slp_instance instance;
8588   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8589     vect_free_slp_instance (instance, true);
8590   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8591   /* Clear the safelen field since its value is invalid after vectorization:
8592      the vectorized loop can have loop-carried dependencies.  */
8593   loop->safelen = 0;
8594 
8595   /* Don't vectorize the epilogue of an epilogue loop.  */
8596   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8597     epilogue = NULL;
8598 
8599   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8600     epilogue = NULL;
8601 
8602   if (epilogue)
8603     {
8604       auto_vector_sizes vector_sizes;
8605       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8606       unsigned int next_size = 0;
8607 
8608       /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8609          on niters already adjusted for the iterations of the prologue.  */
8610       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8611 	  && known_eq (vf, lowest_vf))
8612 	{
8613 	  unsigned HOST_WIDE_INT eiters
8614 	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8615 	       - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8616 	  eiters
8617 	    = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8618 	  epilogue->nb_iterations_upper_bound = eiters - 1;
8619 	  epilogue->any_upper_bound = true;
8620 
8621 	  unsigned int ratio;
8622 	  while (next_size < vector_sizes.length ()
8623 		 && !(constant_multiple_p (current_vector_size,
8624 					   vector_sizes[next_size], &ratio)
8625 		      && eiters >= lowest_vf / ratio))
8626 	    next_size += 1;
8627 	}
8628       else
8629 	while (next_size < vector_sizes.length ()
8630 	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
8631 	  next_size += 1;
8632 
8633       if (next_size == vector_sizes.length ())
8634 	epilogue = NULL;
8635     }
8636 
8637   if (epilogue)
8638     {
8639       epilogue->force_vectorize = loop->force_vectorize;
8640       epilogue->safelen = loop->safelen;
8641       epilogue->dont_vectorize = false;
8642 
8643       /* We may need to if-convert epilogue to vectorize it.  */
8644       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8645 	tree_if_conversion (epilogue);
8646     }
8647 
8648   return epilogue;
8649 }
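
/* Worked example (illustrative, not part of the pass) of the iteration
   count handling in vect_transform_loop: for a loop with a known latch
   bound of 99 (100 iterations), VF = 4, no full masking and no peeling
   for gaps, the vector loop runs NITERS_VECTOR = 100 / 4 = 25 times with
   STEP_VECTOR = 1, and the recorded upper bound becomes
   udiv_floor (99 + 1, 4) - 1 = 24 latch executions.  */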
8650 
8651 /* The code below performs a simple optimization - it reverts
8652    if-conversion for masked stores: if the mask of a store is zero, the
8653    store and, where possible, its stored-value producers are skipped.
8654    For example,
8655      for (i=0; i<n; i++)
8656        if (c[i])
8657 	{
8658 	  p1[i] += 1;
8659 	  p2[i] = p3[i] +2;
8660 	}
8661    this transformation will produce the following semi-hammock:
8662 
8663    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8664      {
8665        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8666        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8667        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8668        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8669        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8670        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8671      }
8672 */
8673 
8674 void
8675 optimize_mask_stores (struct loop *loop)
8676 {
8677   basic_block *bbs = get_loop_body (loop);
8678   unsigned nbbs = loop->num_nodes;
8679   unsigned i;
8680   basic_block bb;
8681   struct loop *bb_loop;
8682   gimple_stmt_iterator gsi;
8683   gimple *stmt;
8684   auto_vec<gimple *> worklist;
8685   auto_purge_vect_location sentinel;
8686 
8687   vect_location = find_loop_location (loop);
8688   /* Pick up all masked stores in loop if any.  */
8689   for (i = 0; i < nbbs; i++)
8690     {
8691       bb = bbs[i];
8692       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8693 	   gsi_next (&gsi))
8694 	{
8695 	  stmt = gsi_stmt (gsi);
8696 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8697 	    worklist.safe_push (stmt);
8698 	}
8699     }
8700 
8701   free (bbs);
8702   if (worklist.is_empty ())
8703     return;
8704 
8705   /* Loop has masked stores.  */
8706   while (!worklist.is_empty ())
8707     {
8708       gimple *last, *last_store;
8709       edge e, efalse;
8710       tree mask;
8711       basic_block store_bb, join_bb;
8712       gimple_stmt_iterator gsi_to;
8713       tree vdef, new_vdef;
8714       gphi *phi;
8715       tree vectype;
8716       tree zero;
8717 
8718       last = worklist.pop ();
8719       mask = gimple_call_arg (last, 2);
8720       bb = gimple_bb (last);
8721       /* Create then_bb and the if-then structure in the CFG; then_bb
8722 	 belongs to the same loop as if_bb.  That loop can differ from LOOP
8723 	 when a two-level loop nest is vectorized and the mask_store belongs
8724 	 to the inner loop.  */
8725       e = split_block (bb, last);
8726       bb_loop = bb->loop_father;
8727       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8728       join_bb = e->dest;
8729       store_bb = create_empty_bb (bb);
8730       add_bb_to_loop (store_bb, bb_loop);
8731       e->flags = EDGE_TRUE_VALUE;
8732       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8733       /* Make the edge into STORE_BB unlikely.  */
8734       efalse->probability = profile_probability::unlikely ();
8735       store_bb->count = efalse->count ();
8736       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8737       if (dom_info_available_p (CDI_DOMINATORS))
8738 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8739       if (dump_enabled_p ())
8740 	dump_printf_loc (MSG_NOTE, vect_location,
8741 			 "Create new block %d to sink mask stores.",
8742 			 store_bb->index);
8743       /* Create vector comparison with boolean result.  */
8744       vectype = TREE_TYPE (mask);
8745       zero = build_zero_cst (vectype);
8746       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8747       gsi = gsi_last_bb (bb);
8748       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8749       /* Create new PHI node for vdef of the last masked store:
8750 	 .MEM_2 = VDEF <.MEM_1>
8751 	 will be converted to
8752 	 .MEM.3 = VDEF <.MEM_1>
8753 	 and new PHI node will be created in join bb
8754 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
8755       */
8756       vdef = gimple_vdef (last);
8757       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8758       gimple_set_vdef (last, new_vdef);
8759       phi = create_phi_node (vdef, join_bb);
8760       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8761 
8762       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8763       while (true)
8764 	{
8765 	  gimple_stmt_iterator gsi_from;
8766 	  gimple *stmt1 = NULL;
8767 
8768 	  /* Move masked store to STORE_BB.  */
8769 	  last_store = last;
8770 	  gsi = gsi_for_stmt (last);
8771 	  gsi_from = gsi;
8772 	  /* Shift GSI to the previous stmt for further traversal.  */
8773 	  gsi_prev (&gsi);
8774 	  gsi_to = gsi_start_bb (store_bb);
8775 	  gsi_move_before (&gsi_from, &gsi_to);
8776 	  /* Setup GSI_TO to the non-empty block start.  */
8777 	  gsi_to = gsi_start_bb (store_bb);
8778 	  if (dump_enabled_p ())
8779 	    dump_printf_loc (MSG_NOTE, vect_location,
8780 			     "Move stmt to created bb\n%G", last);
8781 	  /* Move all stored value producers if possible.  */
8782 	  while (!gsi_end_p (gsi))
8783 	    {
8784 	      tree lhs;
8785 	      imm_use_iterator imm_iter;
8786 	      use_operand_p use_p;
8787 	      bool res;
8788 
8789 	      /* Skip debug statements.  */
8790 	      if (is_gimple_debug (gsi_stmt (gsi)))
8791 		{
8792 		  gsi_prev (&gsi);
8793 		  continue;
8794 		}
8795 	      stmt1 = gsi_stmt (gsi);
8796 	      /* Do not consider statements writing to memory or having
8797 		 volatile operand.  */
8798 	      if (gimple_vdef (stmt1)
8799 		  || gimple_has_volatile_ops (stmt1))
8800 		break;
8801 	      gsi_from = gsi;
8802 	      gsi_prev (&gsi);
8803 	      lhs = gimple_get_lhs (stmt1);
8804 	      if (!lhs)
8805 		break;
8806 
8807 	      /* LHS of vectorized stmt must be SSA_NAME.  */
8808 	      if (TREE_CODE (lhs) != SSA_NAME)
8809 		break;
8810 
8811 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8812 		{
8813 		  /* Remove dead scalar statement.  */
8814 		  if (has_zero_uses (lhs))
8815 		    {
8816 		      gsi_remove (&gsi_from, true);
8817 		      continue;
8818 		    }
8819 		}
8820 
8821 	      /* Check that LHS does not have uses outside of STORE_BB.  */
8822 	      res = true;
8823 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8824 		{
8825 		  gimple *use_stmt;
8826 		  use_stmt = USE_STMT (use_p);
8827 		  if (is_gimple_debug (use_stmt))
8828 		    continue;
8829 		  if (gimple_bb (use_stmt) != store_bb)
8830 		    {
8831 		      res = false;
8832 		      break;
8833 		    }
8834 		}
8835 	      if (!res)
8836 		break;
8837 
8838 	      if (gimple_vuse (stmt1)
8839 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
8840 		break;
8841 
8842 	      /* Can move STMT1 to STORE_BB.  */
8843 	      if (dump_enabled_p ())
8844 		dump_printf_loc (MSG_NOTE, vect_location,
8845 				 "Move stmt to created bb\n%G", stmt1);
8846 	      gsi_move_before (&gsi_from, &gsi_to);
8847 	      /* Shift GSI_TO for further insertion.  */
8848 	      gsi_prev (&gsi_to);
8849 	    }
8850 	  /* Put other masked stores with the same mask to STORE_BB.  */
8851 	  if (worklist.is_empty ()
8852 	      || gimple_call_arg (worklist.last (), 2) != mask
8853 	      || worklist.last () != stmt1)
8854 	    break;
8855 	  last = worklist.pop ();
8856 	}
8857       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8858     }
8859 }
8860