xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-loop.c (revision 181254a7b1bdde6873432bffef2d2decc4b5c22f)
1 /* Loop Vectorization
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 /* Loop Vectorization Pass.
59 
60    This pass tries to vectorize loops.
61 
62    For example, the vectorizer transforms the following simple loop:
63 
64         short a[N]; short b[N]; short c[N]; int i;
65 
66         for (i=0; i<N; i++){
67           a[i] = b[i] + c[i];
68         }
69 
70    as if it were manually vectorized by rewriting the source code into:
71 
72         typedef int __attribute__((mode(V8HI))) v8hi;
73         short a[N];  short b[N]; short c[N];   int i;
74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75         v8hi va, vb, vc;
76 
77         for (i=0; i<N/8; i++){
78           vb = pb[i];
79           vc = pc[i];
80           va = vb + vc;
81           pa[i] = va;
82         }
83 
84         The main entry to this pass is vectorize_loops(), in which
85    the vectorizer applies a set of analyses on a given set of loops,
86    followed by the actual vectorization transformation for the loops that
87    had successfully passed the analysis phase.
88         Throughout this pass we make a distinction between two types of
89    data: scalars (which are represented by SSA_NAMES), and memory references
90    ("data-refs").  These two types of data require different handling both
91    during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94    accesses are required to have a simple (consecutive) access pattern.
95 
96    Analysis phase:
97    ===============
98         The driver for the analysis phase is vect_analyze_loop().
99    It applies a set of analyses, some of which rely on the scalar evolution
100    analyzer (scev) developed by Sebastian Pop.
101 
102         During the analysis phase the vectorizer records some information
103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104    loop, as well as general information about the loop as a whole, which is
105    recorded in a "loop_vec_info" struct attached to each loop.
106 
107    Transformation phase:
108    =====================
109         The loop transformation phase scans all the stmts in the loop, and
110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111    the loop that needs to be vectorized.  It inserts the vector code sequence
112    just before the scalar stmt S, and records a pointer to the vector code
113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114    attached to S).  This pointer will be used for the vectorization of following
115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116    otherwise, we rely on dead code elimination for removing it.
117 
118         For example, say stmt S1 was vectorized into stmt VS1:
119 
120    VS1: vb = px[i];
121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122    S2:  a = b;
123 
124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
127    resulting sequence would be:
128 
129    VS1: vb = px[i];
130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131    VS2: va = vb;
132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 
134         Operands that are not SSA_NAMEs are data-refs that appear in
135    load/store operations (like 'x[i]' in S1), and are handled differently.
136 
137    Target modeling:
138    =================
139         Currently the only target-specific information that is used is the
140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different vector sizes will, for now, need
142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
143    flexibility will be added in the future.
144 
145         Since we only vectorize operations whose vector form can be
146    expressed using existing tree codes, to verify that an operation is
147    supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
149    the value found is CODE_FOR_nothing, then there's no target support, and
150    we can't vectorize the stmt.
151 
152    For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
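/* As a purely illustrative sketch of the target-support query described
   above (the real checks live in the vectorizable_* routines;
   optab_handler, add_optab, TYPE_MODE and CODE_FOR_nothing are the
   existing interfaces named in the comment, the surrounding lines are
   hypothetical glue):

       machine_mode vmode = TYPE_MODE (vectype);
       if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
         return false;

   i.e. a V8HImode addition is vectorizable only if add_optab has a
   handler for V8HImode.  */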
155 
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 
158 /* Function vect_determine_vectorization_factor
159 
160    Determine the vectorization factor (VF).  VF is the number of data elements
161    that are operated upon in parallel in a single iteration of the vectorized
162    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
163    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164    elements can fit in a single vector register.
165 
166    We currently support vectorization of loops in which all types operated upon
167    are of the same size.  Therefore this function currently sets VF according to
168    the size of the types operated upon, and fails if there are multiple sizes
169    in the loop.
170 
171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
172    original loop:
173         for (i=0; i<N; i++){
174           a[i] = b[i] + c[i];
175         }
176 
177    vectorized loop:
178         for (i=0; i<N; i+=VF){
179           a[i:VF] = b[i:VF] + c[i:VF];
180         }
181 */
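/* A worked instance of the arithmetic above, with illustrative numbers:
   on a target whose vector size (VS) is 16 bytes, a loop operating on
   2-byte "short" elements gets

       VF = VS / sizeof (short) = 16 / 2 = 8,

   so the strip-mined loop processes a[i:8] per iteration and executes
   roughly N/8 times (handling of the N % 8 leftover iterations is the
   job of the peeling/epilogue code elsewhere in the vectorizer).  */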
182 
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 {
186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188   unsigned nbbs = loop->num_nodes;
189   poly_uint64 vectorization_factor = 1;
190   tree scalar_type = NULL_TREE;
191   gphi *phi;
192   tree vectype;
193   stmt_vec_info stmt_info;
194   unsigned i;
195   HOST_WIDE_INT dummy;
196   gimple *stmt, *pattern_stmt = NULL;
197   gimple_seq pattern_def_seq = NULL;
198   gimple_stmt_iterator pattern_def_si = gsi_none ();
199   bool analyze_pattern_stmt = false;
200   bool bool_result;
201   auto_vec<stmt_vec_info> mask_producers;
202 
203   if (dump_enabled_p ())
204     dump_printf_loc (MSG_NOTE, vect_location,
205                      "=== vect_determine_vectorization_factor ===\n");
206 
207   for (i = 0; i < nbbs; i++)
208     {
209       basic_block bb = bbs[i];
210 
211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 	   gsi_next (&si))
213 	{
214 	  phi = si.phi ();
215 	  stmt_info = vinfo_for_stmt (phi);
216 	  if (dump_enabled_p ())
217 	    {
218 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 	    }
221 
222 	  gcc_assert (stmt_info);
223 
224 	  if (STMT_VINFO_RELEVANT_P (stmt_info)
225 	      || STMT_VINFO_LIVE_P (stmt_info))
226             {
227 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 
230 	      if (dump_enabled_p ())
231 		{
232 		  dump_printf_loc (MSG_NOTE, vect_location,
233                                    "get vectype for scalar type:  ");
234 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235                   dump_printf (MSG_NOTE, "\n");
236 		}
237 
238 	      vectype = get_vectype_for_scalar_type (scalar_type);
239 	      if (!vectype)
240 		{
241 		  if (dump_enabled_p ())
242 		    {
243 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244                                        "not vectorized: unsupported "
245                                        "data-type ");
246 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247                                          scalar_type);
248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249 		    }
250 		  return false;
251 		}
252 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 
254 	      if (dump_enabled_p ())
255 		{
256 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258                   dump_printf (MSG_NOTE, "\n");
259 		}
260 
261 	      if (dump_enabled_p ())
262 		{
263 		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 		  dump_printf (MSG_NOTE, "\n");
266 		}
267 
268 	      vect_update_max_nunits (&vectorization_factor, vectype);
269 	    }
270 	}
271 
272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 	   !gsi_end_p (si) || analyze_pattern_stmt;)
274         {
275           tree vf_vectype;
276 
277           if (analyze_pattern_stmt)
278 	    stmt = pattern_stmt;
279           else
280             stmt = gsi_stmt (si);
281 
282           stmt_info = vinfo_for_stmt (stmt);
283 
284 	  if (dump_enabled_p ())
285 	    {
286 	      dump_printf_loc (MSG_NOTE, vect_location,
287                                "==> examining statement: ");
288 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289 	    }
290 
291 	  gcc_assert (stmt_info);
292 
293 	  /* Skip stmts which do not need to be vectorized.  */
294 	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 	       && !STMT_VINFO_LIVE_P (stmt_info))
296 	      || gimple_clobber_p (stmt))
297             {
298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302                 {
303                   stmt = pattern_stmt;
304                   stmt_info = vinfo_for_stmt (pattern_stmt);
305                   if (dump_enabled_p ())
306                     {
307                       dump_printf_loc (MSG_NOTE, vect_location,
308                                        "==> examining pattern statement: ");
309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310                     }
311                 }
312               else
313 	        {
314 	          if (dump_enabled_p ())
315 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316                   gsi_next (&si);
317 	          continue;
318                 }
319 	    }
320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324             analyze_pattern_stmt = true;
325 
326 	  /* If a pattern statement has def stmts, analyze them too.  */
327 	  if (is_pattern_stmt_p (stmt_info))
328 	    {
329 	      if (pattern_def_seq == NULL)
330 		{
331 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 		  pattern_def_si = gsi_start (pattern_def_seq);
333 		}
334 	      else if (!gsi_end_p (pattern_def_si))
335 		gsi_next (&pattern_def_si);
336 	      if (pattern_def_seq != NULL)
337 		{
338 		  gimple *pattern_def_stmt = NULL;
339 		  stmt_vec_info pattern_def_stmt_info = NULL;
340 
341 		  while (!gsi_end_p (pattern_def_si))
342 		    {
343 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
344 		      pattern_def_stmt_info
345 			= vinfo_for_stmt (pattern_def_stmt);
346 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 			break;
349 		      gsi_next (&pattern_def_si);
350 		    }
351 
352 		  if (!gsi_end_p (pattern_def_si))
353 		    {
354 		      if (dump_enabled_p ())
355 			{
356 			  dump_printf_loc (MSG_NOTE, vect_location,
357                                            "==> examining pattern def stmt: ");
358 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359                                             pattern_def_stmt, 0);
360 			}
361 
362 		      stmt = pattern_def_stmt;
363 		      stmt_info = pattern_def_stmt_info;
364 		    }
365 		  else
366 		    {
367 		      pattern_def_si = gsi_none ();
368 		      analyze_pattern_stmt = false;
369 		    }
370 		}
371 	      else
372 		analyze_pattern_stmt = false;
373 	    }
374 
375 	  if (gimple_get_lhs (stmt) == NULL_TREE
376 	      /* MASK_STORE has no lhs, but is ok.  */
377 	      && (!is_gimple_call (stmt)
378 		  || !gimple_call_internal_p (stmt)
379 		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380 	    {
381 	      if (is_gimple_call (stmt))
382 		{
383 		  /* Ignore calls with no lhs.  These must be calls to
384 		     #pragma omp simd functions, and what vectorization factor
385 		     it really needs can't be determined until
386 		     vectorizable_simd_clone_call.  */
387 		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388 		    {
389 		      pattern_def_seq = NULL;
390 		      gsi_next (&si);
391 		    }
392 		  continue;
393 		}
394 	      if (dump_enabled_p ())
395 		{
396 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397                                    "not vectorized: irregular stmt.");
398 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
399                                     0);
400 		}
401 	      return false;
402 	    }
403 
404 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 	    {
406 	      if (dump_enabled_p ())
407 	        {
408 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409                                    "not vectorized: vector stmt in loop:");
410 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411 	        }
412 	      return false;
413 	    }
414 
415 	  bool_result = false;
416 
417 	  if (STMT_VINFO_VECTYPE (stmt_info))
418 	    {
419 	      /* The only case in which a vectype has already been set is for stmts
420 	         that contain a dataref, or for "pattern-stmts" (stmts
421 		 generated by the vectorizer to represent/replace a certain
422 		 idiom).  */
423 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 			  || is_pattern_stmt_p (stmt_info)
425 			  || !gsi_end_p (pattern_def_si));
426 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
427 	    }
428 	  else
429 	    {
430 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 	      if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 	      else
434 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 
436 	      /* Bool ops don't participate in the vectorization factor
437 		 computation.  For comparisons, use the compared types to
438 		 compute a factor.  */
439 	      if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 		  && is_gimple_assign (stmt)
441 		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
442 		{
443 		  if (STMT_VINFO_RELEVANT_P (stmt_info)
444 		      || STMT_VINFO_LIVE_P (stmt_info))
445 		    mask_producers.safe_push (stmt_info);
446 		  bool_result = true;
447 
448 		  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 		      == tcc_comparison
450 		      && !VECT_SCALAR_BOOLEAN_TYPE_P
451 			    (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 		  else
454 		    {
455 		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 			{
457 			  pattern_def_seq = NULL;
458 			  gsi_next (&si);
459 			}
460 		      continue;
461 		    }
462 		}
463 
464 	      if (dump_enabled_p ())
465 		{
466 		  dump_printf_loc (MSG_NOTE, vect_location,
467                                    "get vectype for scalar type:  ");
468 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469                   dump_printf (MSG_NOTE, "\n");
470 		}
471 	      vectype = get_vectype_for_scalar_type (scalar_type);
472 	      if (!vectype)
473 		{
474 		  if (dump_enabled_p ())
475 		    {
476 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477                                        "not vectorized: unsupported "
478                                        "data-type ");
479 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480                                          scalar_type);
481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 		    }
483 		  return false;
484 		}
485 
486 	      if (!bool_result)
487 		STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 
489 	      if (dump_enabled_p ())
490 		{
491 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493                   dump_printf (MSG_NOTE, "\n");
494 		}
495             }
496 
497 	  /* Don't try to compute the VF from scalar types if the stmt
498 	     produces a boolean vector.  Use the result vectype instead.  */
499 	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 	    vf_vectype = vectype;
501 	  else
502 	    {
503 	      /* The vectorization factor is determined by the smallest
504 		 scalar type (or the largest vector size, but we only
505 		 support one vector size per loop).  */
506 	      if (!bool_result)
507 		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 							     &dummy);
509 	      if (dump_enabled_p ())
510 		{
511 		  dump_printf_loc (MSG_NOTE, vect_location,
512 				   "get vectype for scalar type:  ");
513 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 		  dump_printf (MSG_NOTE, "\n");
515 		}
516 	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 	    }
518 	  if (!vf_vectype)
519 	    {
520 	      if (dump_enabled_p ())
521 		{
522 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523                                    "not vectorized: unsupported data-type ");
524 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525                                      scalar_type);
526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 		}
528 	      return false;
529 	    }
530 
531 	  if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 			GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 	    {
534 	      if (dump_enabled_p ())
535 		{
536 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537                                    "not vectorized: different sized vector "
538                                    "types in statement, ");
539 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540                                      vectype);
541 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543                                      vf_vectype);
544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 		}
546 	      return false;
547 	    }
548 
549 	  if (dump_enabled_p ())
550 	    {
551 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553               dump_printf (MSG_NOTE, "\n");
554 	    }
555 
556 	  if (dump_enabled_p ())
557 	    {
558 	      dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 	      dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 	      dump_printf (MSG_NOTE, "\n");
561 	    }
562 
563 	  vect_update_max_nunits (&vectorization_factor, vf_vectype);
564 
565 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566 	    {
567 	      pattern_def_seq = NULL;
568 	      gsi_next (&si);
569 	    }
570         }
571     }
572 
573   /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
574   if (dump_enabled_p ())
575     {
576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577       dump_dec (MSG_NOTE, vectorization_factor);
578       dump_printf (MSG_NOTE, "\n");
579     }
580 
581   if (known_le (vectorization_factor, 1U))
582     {
583       if (dump_enabled_p ())
584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585                          "not vectorized: unsupported data-type\n");
586       return false;
587     }
588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
589 
590   for (i = 0; i < mask_producers.length (); i++)
591     {
592       tree mask_type = NULL;
593 
594       stmt = STMT_VINFO_STMT (mask_producers[i]);
595 
596       if (is_gimple_assign (stmt)
597 	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 	  && !VECT_SCALAR_BOOLEAN_TYPE_P
599 				      (TREE_TYPE (gimple_assign_rhs1 (stmt))))
600 	{
601 	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 	  mask_type = get_mask_type_for_scalar_type (scalar_type);
603 
604 	  if (!mask_type)
605 	    {
606 	      if (dump_enabled_p ())
607 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 				 "not vectorized: unsupported mask\n");
609 	      return false;
610 	    }
611 	}
612       else
613 	{
614 	  tree rhs;
615 	  ssa_op_iter iter;
616 	  gimple *def_stmt;
617 	  enum vect_def_type dt;
618 
619 	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
620 	    {
621 	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 				       &def_stmt, &dt, &vectype))
623 		{
624 		  if (dump_enabled_p ())
625 		    {
626 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 				       "not vectorized: can't compute mask type "
628 				       "for statement, ");
629 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
630 					0);
631 		    }
632 		  return false;
633 		}
634 
635 	      /* No vectype probably means an external definition.
636 		 Allow it in case there is another operand from which
637 		 the mask type can be determined.  */
638 	      if (!vectype)
639 		continue;
640 
641 	      if (!mask_type)
642 		mask_type = vectype;
643 	      else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 				 TYPE_VECTOR_SUBPARTS (vectype)))
645 		{
646 		  if (dump_enabled_p ())
647 		    {
648 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 				       "not vectorized: different sized mask "
650 				       "types in statement, ");
651 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 					 mask_type);
653 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 					 vectype);
656 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
657 		    }
658 		  return false;
659 		}
660 	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 		       != VECTOR_BOOLEAN_TYPE_P (vectype))
662 		{
663 		  if (dump_enabled_p ())
664 		    {
665 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 				       "not vectorized: mixed mask and "
667 				       "nonmask vector types in statement, ");
668 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 					 mask_type);
670 		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 					 vectype);
673 		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
674 		    }
675 		  return false;
676 		}
677 	    }
678 
679 	  /* We may compare a boolean value loaded as a vector of integers.
680 	     Fix mask_type in such a case.  */
681 	  if (mask_type
682 	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 	      && gimple_code (stmt) == GIMPLE_ASSIGN
684 	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 	    mask_type = build_same_sized_truth_vector_type (mask_type);
686 	}
687 
688       /* No mask_type should mean a loop-invariant predicate.
689 	 This is probably a subject for optimization in
690 	 if-conversion.  */
691       if (!mask_type)
692 	{
693 	  if (dump_enabled_p ())
694 	    {
695 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 			       "not vectorized: can't compute mask type "
697 			       "for statement, ");
698 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
699 				0);
700 	    }
701 	  return false;
702 	}
703 
704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
705     }
706 
707   return true;
708 }
709 
710 
711 /* Function vect_is_simple_iv_evolution.
712 
713    FORNOW: A simple evolution of an induction variable in the loop is
714    considered a polynomial evolution.  */
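/* For example (illustrative; assuming the loop in question has number 1):
   for the C loop

       for (i = 0; i < n; i++)
         a[i] = ...;

   the scalar evolution of "i" is the chrec {0, +, 1}_1, so INIT becomes 0
   and STEP becomes 1.  A second-degree chrec such as {0, +, {1, +, 2}_1}_1
   (a quadratic evolution) is rejected as not "simple".  */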
715 
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718                              tree * step)
719 {
720   tree init_expr;
721   tree step_expr;
722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723   basic_block bb;
724 
725   /* When there is no evolution in this loop, the evolution function
726      is not "simple".  */
727   if (evolution_part == NULL_TREE)
728     return false;
729 
730   /* When the evolution is a polynomial of degree >= 2
731      the evolution function is not "simple".  */
732   if (tree_is_chrec (evolution_part))
733     return false;
734 
735   step_expr = evolution_part;
736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
737 
738   if (dump_enabled_p ())
739     {
740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742       dump_printf (MSG_NOTE, ",  init: ");
743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744       dump_printf (MSG_NOTE, "\n");
745     }
746 
747   *init = init_expr;
748   *step = step_expr;
749 
750   if (TREE_CODE (step_expr) != INTEGER_CST
751       && (TREE_CODE (step_expr) != SSA_NAME
752 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 		  || !flag_associative_math)))
757       && (TREE_CODE (step_expr) != REAL_CST
758 	  || !flag_associative_math))
759     {
760       if (dump_enabled_p ())
761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762                          "step unknown.\n");
763       return false;
764     }
765 
766   return true;
767 }
768 
769 /* Function vect_analyze_scalar_cycles_1.
770 
771    Examine the cross iteration def-use cycles of scalar variables
772    in LOOP.  LOOP_VINFO represents the loop that is now being
773    considered for vectorization (can be LOOP, or an outer-loop
774    enclosing LOOP).  */
775 
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
778 {
779   basic_block bb = loop->header;
780   tree init, step;
781   auto_vec<gimple *, 64> worklist;
782   gphi_iterator gsi;
783   bool double_reduc;
784 
785   if (dump_enabled_p ())
786     dump_printf_loc (MSG_NOTE, vect_location,
787                      "=== vect_analyze_scalar_cycles ===\n");
788 
789   /* First - identify all inductions.  Reduction detection assumes that all the
790      inductions have been identified; therefore, this order must not be
791      changed.  */
792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
793     {
794       gphi *phi = gsi.phi ();
795       tree access_fn = NULL;
796       tree def = PHI_RESULT (phi);
797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
798 
799       if (dump_enabled_p ())
800 	{
801 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
803 	}
804 
805       /* Skip virtual PHIs.  The data dependences that are associated with
806          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
807       if (virtual_operand_p (def))
808 	continue;
809 
810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 
812       /* Analyze the evolution function.  */
813       access_fn = analyze_scalar_evolution (loop, def);
814       if (access_fn)
815 	{
816 	  STRIP_NOPS (access_fn);
817 	  if (dump_enabled_p ())
818 	    {
819 	      dump_printf_loc (MSG_NOTE, vect_location,
820                                "Access function of PHI: ");
821 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822               dump_printf (MSG_NOTE, "\n");
823 	    }
824 	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 	    = initial_condition_in_loop_num (access_fn, loop->num);
826 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 	    = evolution_part_in_loop_num (access_fn, loop->num);
828 	}
829 
830       if (!access_fn
831 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 	      && TREE_CODE (step) != INTEGER_CST))
834 	{
835 	  worklist.safe_push (phi);
836 	  continue;
837 	}
838 
839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 		  != NULL_TREE);
841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 
843       if (dump_enabled_p ())
844 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
846     }
847 
848 
849   /* Second - identify all reductions and nested cycles.  */
850   while (worklist.length () > 0)
851     {
852       gimple *phi = worklist.pop ();
853       tree def = PHI_RESULT (phi);
854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855       gimple *reduc_stmt;
856 
857       if (dump_enabled_p ())
858         {
859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
861         }
862 
863       gcc_assert (!virtual_operand_p (def)
864 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 
866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 						&double_reduc, false);
868       if (reduc_stmt)
869         {
870           if (double_reduc)
871             {
872               if (dump_enabled_p ())
873                 dump_printf_loc (MSG_NOTE, vect_location,
874 				 "Detected double reduction.\n");
875 
876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878                                                     vect_double_reduction_def;
879             }
880           else
881             {
882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
883                 {
884                   if (dump_enabled_p ())
885                     dump_printf_loc (MSG_NOTE, vect_location,
886 				     "Detected vectorizable nested cycle.\n");
887 
888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890                                                              vect_nested_cycle;
891                 }
892               else
893                 {
894                   if (dump_enabled_p ())
895                     dump_printf_loc (MSG_NOTE, vect_location,
896 				     "Detected reduction.\n");
897 
898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900                                                            vect_reduction_def;
901                   /* Store the reduction cycles for possible vectorization in
902                      loop-aware SLP if it was not detected as a reduction
903 		     chain.  */
904 		  if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
906                 }
907             }
908         }
909       else
910         if (dump_enabled_p ())
911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 			   "Unknown def-use cycle pattern.\n");
913     }
914 }
915 
916 
917 /* Function vect_analyze_scalar_cycles.
918 
919    Examine the cross iteration def-use cycles of scalar variables, by
920    analyzing the loop-header PHIs of scalar variables.  Classify each
921    cycle as one of the following: invariant, induction, reduction, unknown.
922    We do that for the loop represented by LOOP_VINFO, and also for its
923    inner-loop, if it exists.
924    Examples for scalar cycles:
925 
926    Example1: reduction:
927 
928               loop1:
929               for (i=0; i<N; i++)
930                  sum += a[i];
931 
932    Example2: induction:
933 
934               loop2:
935               for (i=0; i<N; i++)
936                  a[i] = i;  */
937 
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 {
941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 
943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 
945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946      Reductions in such an inner-loop therefore have different properties than
947      the reductions in the nest that gets vectorized:
948      1. When vectorized, they are executed in the same order as in the original
949         scalar loop, so we can't change the order of computation when
950         vectorizing them.
951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952         current checks are too strict.  */
953 
954   if (loop->inner)
955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 }
957 
958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
959 
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
962 {
963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964   gimple *stmtp;
965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 	      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
968   do
969     {
970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973       if (stmt)
974 	GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 	  = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976     }
977   while (stmt);
978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 }
980 
981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
982 
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 {
986   gimple *first;
987   unsigned i;
988 
989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991       {
992 	gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 	while (next)
994 	  {
995 	    if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 	      break;
997 	    next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 	  }
999 	/* If not all stmts in the chain are patterns, try to handle
1000 	   the chain without patterns.  */
1001 	if (! next)
1002 	  {
1003 	    vect_fixup_reduc_chain (first);
1004 	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 	      = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006 	  }
1007       }
1008 }
1009 
1010 /* Function vect_get_loop_niters.
1011 
1012    Determine how many iterations the loop executes and place the result
1013    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1014    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1015    niter information holds in ASSUMPTIONS.
1016 
1017    Return the loop exit condition.  */
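/* For instance (illustrative): for a countable loop whose body executes
   N > 0 times, the latch runs N - 1 times, so NUMBER_OF_ITERATIONSM1 is
   set to N - 1 and NUMBER_OF_ITERATIONS to N; ASSUMPTIONS stays
   boolean_true_node unless the niter analysis only holds conditionally.  */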
1018 
1019 
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 		      tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028 
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034 		     "=== get_loop_niters ===\n");
1035 
1036   if (!exit)
1037     return cond;
1038 
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045 
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049 
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052 
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056 	{
1057 	  /* Try to combine may_be_zero with assumptions; this can simplify
1058 	     the computation of the niter expression.  */
1059 	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 					     niter_assumptions,
1062 					     fold_build1 (TRUTH_NOT_EXPR,
1063 							  boolean_type_node,
1064 							  may_be_zero));
1065 	  else
1066 	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 				 build_int_cst (TREE_TYPE (niter), 0),
1068 				 rewrite_to_non_trapping_overflow (niter));
1069 
1070 	  may_be_zero = NULL_TREE;
1071 	}
1072       else if (integer_nonzerop (may_be_zero))
1073 	{
1074 	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 	  return cond;
1077 	}
1078       else
1079 	return cond;
1080     }
1081 
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084 
1085   /* We want the number of loop header executions, which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 			  build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093 
1094   return cond;
1095 }
1096 
1097 /* Function bb_in_loop_p
1098 
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100 
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
1104   const struct loop *const loop = (const struct loop *)data;
1105   if (flow_bb_inside_loop_p (loop, bb))
1106     return true;
1107   return false;
1108 }
1109 
1110 
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113 
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     ivexpr_map (NULL),
1132     slp_unrolling_factor (1),
1133     single_scalar_iteration_cost (0),
1134     vectorizable (false),
1135     can_fully_mask_p (true),
1136     fully_masked_p (false),
1137     peeling_for_gaps (false),
1138     peeling_for_niter (false),
1139     operands_swapped (false),
1140     no_data_dependencies (false),
1141     has_mask_store (false),
1142     scalar_loop (NULL),
1143     orig_loop_info (NULL)
1144 {
1145   /* Create/Update stmt_info for all stmts in the loop.  */
1146   basic_block *body = get_loop_body (loop);
1147   for (unsigned int i = 0; i < loop->num_nodes; i++)
1148     {
1149       basic_block bb = body[i];
1150       gimple_stmt_iterator si;
1151 
1152       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153 	{
1154 	  gimple *phi = gsi_stmt (si);
1155 	  gimple_set_uid (phi, 0);
1156 	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1157 	}
1158 
1159       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160 	{
1161 	  gimple *stmt = gsi_stmt (si);
1162 	  gimple_set_uid (stmt, 0);
1163 	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1164 	}
1165     }
1166   free (body);
1167 
1168   /* CHECKME: We want to visit all BBs before their successors (except for
1169      latch blocks, for which this assertion wouldn't hold).  In the simple
1170      case of the loop forms we allow, a dfs order of the BBs would be the same
1171      as a reversed postorder traversal, so we are safe.  */
1172 
1173   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 					  bbs, loop->num_nodes, loop);
1175   gcc_assert (nbbs == loop->num_nodes);
1176 }
1177 
1178 /* Free all levels of MASKS.  */
1179 
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1182 {
1183   rgroup_masks *rgm;
1184   unsigned int i;
1185   FOR_EACH_VEC_ELT (*masks, i, rgm)
1186     rgm->masks.release ();
1187   masks->release ();
1188 }
1189 
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191    stmt_vec_info structs of all the stmts in the loop.  */
1192 
1193 _loop_vec_info::~_loop_vec_info ()
1194 {
1195   int nbbs;
1196   gimple_stmt_iterator si;
1197   int j;
1198 
1199   nbbs = loop->num_nodes;
1200   for (j = 0; j < nbbs; j++)
1201     {
1202       basic_block bb = bbs[j];
1203       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204         free_stmt_vec_info (gsi_stmt (si));
1205 
1206       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207         {
1208 	  gimple *stmt = gsi_stmt (si);
1209 
1210 	  /* We may have broken canonical form by moving a constant
1211 	     into RHS1 of a commutative op.  Fix such occurrences.  */
1212 	  if (operands_swapped && is_gimple_assign (stmt))
1213 	    {
1214 	      enum tree_code code = gimple_assign_rhs_code (stmt);
1215 
1216 	      if ((code == PLUS_EXPR
1217 		   || code == POINTER_PLUS_EXPR
1218 		   || code == MULT_EXPR)
1219 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 		swap_ssa_operands (stmt,
1221 				   gimple_assign_rhs1_ptr (stmt),
1222 				   gimple_assign_rhs2_ptr (stmt));
1223 	      else if (code == COND_EXPR
1224 		       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225 		{
1226 		  tree cond_expr = gimple_assign_rhs1 (stmt);
1227 		  enum tree_code cond_code = TREE_CODE (cond_expr);
1228 
1229 		  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230 		    {
1231 		      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 								  0));
1233 		      cond_code = invert_tree_comparison (cond_code,
1234 							  honor_nans);
1235 		      if (cond_code != ERROR_MARK)
1236 			{
1237 			  TREE_SET_CODE (cond_expr, cond_code);
1238 			  swap_ssa_operands (stmt,
1239 					     gimple_assign_rhs2_ptr (stmt),
1240 					     gimple_assign_rhs3_ptr (stmt));
1241 			}
1242 		    }
1243 		}
1244 	    }
1245 
1246 	  /* Free stmt_vec_info.  */
1247 	  free_stmt_vec_info (stmt);
1248           gsi_next (&si);
1249         }
1250     }
1251 
1252   free (bbs);
1253 
1254   release_vec_loop_masks (&masks);
1255   delete ivexpr_map;
1256 
1257   loop->aux = NULL;
1258 }
1259 
1260 /* Return an invariant or register for EXPR and emit necessary
1261    computations in the LOOP_VINFO loop preheader.  */
1262 
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1265 {
1266   if (is_gimple_reg (expr)
1267       || is_gimple_min_invariant (expr))
1268     return expr;
1269 
1270   if (! loop_vinfo->ivexpr_map)
1271     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273   if (! cached)
1274     {
1275       gimple_seq stmts = NULL;
1276       cached = force_gimple_operand (unshare_expr (expr),
1277 				     &stmts, true, NULL_TREE);
1278       if (stmts)
1279 	{
1280 	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 	  gsi_insert_seq_on_edge_immediate (e, stmts);
1282 	}
1283     }
1284   return cached;
1285 }
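/* An illustrative (hypothetical) use of the helper above: a caller that
   needs "niters - 1" available as a register or invariant in the
   preheader could write

       tree nm1 = cse_and_gimplify_to_preheader
	 (loop_vinfo, fold_build2 (MINUS_EXPR, type, niters,
				   build_one_cst (type)));

   where "type", "niters" and "nm1" are placeholder names; repeated
   requests for the same expression reuse the SSA name cached in
   ivexpr_map.  */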
1286 
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288    all masks required to mask LOOP_VINFO.  */
1289 
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1292 {
1293   rgroup_masks *rgm;
1294   unsigned int i;
1295   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296     if (rgm->mask_type != NULL_TREE
1297 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 					    cmp_type, rgm->mask_type,
1299 					    OPTIMIZE_FOR_SPEED))
1300       return false;
1301   return true;
1302 }
1303 
1304 /* Calculate the maximum number of scalars per iteration for every
1305    rgroup in LOOP_VINFO.  */
1306 
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1309 {
1310   unsigned int res = 1;
1311   unsigned int i;
1312   rgroup_masks *rgm;
1313   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314     res = MAX (res, rgm->max_nscalars_per_iter);
1315   return res;
1316 }
1317 
1318 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1319    whether we can actually generate the masks required.  Return true if so,
1320    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
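/* A worked example of the computation below, with illustrative numbers:
   if the niter type is a 32-bit unsigned type, max_ni starts at 2^32;
   if max_loop_iterations proves at most 999 back edges, max_ni is
   refined to 1000; with at most 2 scalars per iteration in the widest
   rgroup the limit becomes 2000, which needs 11 bits, so any supported
   integer mode of at least 11 bits for which WHILE_ULT handles every
   required mask type can serve as the comparison type.  */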
1321 
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1324 {
1325   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326   unsigned int min_ni_width;
1327 
1328   /* Use a normal loop if there are no statements that need masking.
1329      This only happens in rare degenerate cases: it means that the loop
1330      has no loads, no stores, and no live-out values.  */
1331   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332     return false;
1333 
1334   /* Get the maximum number of iterations that is representable
1335      in the counter type.  */
1336   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1338 
1339   /* Get a more refined estimate for the number of iterations.  */
1340   widest_int max_back_edges;
1341   if (max_loop_iterations (loop, &max_back_edges))
1342     max_ni = wi::smin (max_ni, max_back_edges + 1);
1343 
1344   /* Account for rgroup masks, in which each bit is replicated N times.  */
1345   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1346 
1347   /* Work out how many bits we need to represent the limit.  */
1348   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1349 
1350   /* Find a scalar mode for which WHILE_ULT is supported.  */
1351   opt_scalar_int_mode cmp_mode_iter;
1352   tree cmp_type = NULL_TREE;
1353   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1354     {
1355       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356       if (cmp_bits >= min_ni_width
1357 	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1358 	{
1359 	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 	  if (this_type
1361 	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1362 	    {
1363 	      /* Although we could stop as soon as we find a valid mode,
1364 		 it's often better to continue until we hit Pmode, since the
1365 		 operands to the WHILE are more likely to be reusable in
1366 		 address calculations.  */
1367 	      cmp_type = this_type;
1368 	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 		break;
1370 	    }
1371 	}
1372     }
1373 
1374   if (!cmp_type)
1375     return false;
1376 
1377   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378   return true;
1379 }
1380 
1381 /* Calculate the cost of one scalar iteration of the loop.  */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1384 {
1385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387   int nbbs = loop->num_nodes, factor;
1388   int innerloop_iters, i;
1389 
1390   /* Gather costs for statements in the scalar loop.  */
1391 
1392   /* FORNOW.  */
1393   innerloop_iters = 1;
1394   if (loop->inner)
1395     innerloop_iters = 50; /* FIXME */
1396 
1397   for (i = 0; i < nbbs; i++)
1398     {
1399       gimple_stmt_iterator si;
1400       basic_block bb = bbs[i];
1401 
1402       if (bb->loop_father == loop->inner)
1403         factor = innerloop_iters;
1404       else
1405         factor = 1;
1406 
1407       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409 	  gimple *stmt = gsi_stmt (si);
1410           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 
1412           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413             continue;
1414 
1415           /* Skip stmts that are not vectorized inside the loop.  */
1416           if (stmt_info
1417               && !STMT_VINFO_RELEVANT_P (stmt_info)
1418               && (!STMT_VINFO_LIVE_P (stmt_info)
1419                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421             continue;
1422 
1423 	  vect_cost_for_stmt kind;
1424           if (STMT_VINFO_DATA_REF (stmt_info))
1425             {
1426               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427                kind = scalar_load;
1428              else
1429                kind = scalar_store;
1430             }
1431           else
1432             kind = scalar_stmt;
1433 
1434 	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 			    factor, kind, stmt_info, 0, vect_prologue);
1436         }
1437     }
1438 
1439   /* Now accumulate cost.  */
1440   void *target_cost_data = init_cost (loop);
1441   stmt_info_for_cost *si;
1442   int j;
1443   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 		    j, si)
1445     {
1446       struct _stmt_vec_info *stmt_info
1447 	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448       (void) add_stmt_cost (target_cost_data, si->count,
1449 			    si->kind, stmt_info, si->misalign,
1450 			    vect_body);
1451     }
1452   unsigned dummy, body_cost = 0;
1453   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454   destroy_cost_data (target_cost_data);
1455   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1456 }
1457 
1458 
1459 /* Function vect_analyze_loop_form_1.
1460 
1461    Verify that certain CFG restrictions hold, including:
1462    - the loop has a pre-header
1463    - the loop has a single entry and exit
1464    - the loop exit condition is simple enough
1465    - the number of iterations can be analyzed, i.e., a countable loop.  The
1466      niter could be analyzed under some assumptions.  */
1467 
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 			  tree *assumptions, tree *number_of_iterationsm1,
1471 			  tree *number_of_iterations, gcond **inner_loop_cond)
1472 {
1473   if (dump_enabled_p ())
1474     dump_printf_loc (MSG_NOTE, vect_location,
1475 		     "=== vect_analyze_loop_form ===\n");
1476 
1477   /* Different restrictions apply when we are considering an inner-most loop,
1478      vs. an outer (nested) loop.
1479      (FORNOW. May want to relax some of these restrictions in the future).  */
1480 
1481   if (!loop->inner)
1482     {
1483       /* Inner-most loop.  We currently require that the number of BBs is
1484 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1485 	 look like this:
1486 
1487                         (pre-header)
1488                            |
1489                           header <--------+
1490                            | |            |
1491                            | +--> latch --+
1492                            |
1493                         (exit-bb)  */
1494 
1495       if (loop->num_nodes != 2)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 			     "not vectorized: control flow in loop.\n");
1500           return false;
1501         }
1502 
1503       if (empty_block_p (loop->header))
1504 	{
1505 	  if (dump_enabled_p ())
1506 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 			     "not vectorized: empty loop.\n");
1508 	  return false;
1509 	}
1510     }
1511   else
1512     {
1513       struct loop *innerloop = loop->inner;
1514       edge entryedge;
1515 
1516       /* Nested loop. We currently require that the loop is doubly-nested,
1517 	 contains a single inner loop, and the number of BBs is exactly 5.
1518 	 Vectorizable outer-loops look like this:
1519 
1520 			(pre-header)
1521 			   |
1522 			  header <---+
1523 			   |         |
1524 		          inner-loop |
1525 			   |         |
1526 			  tail ------+
1527 			   |
1528 		        (exit-bb)
1529 
1530 	 The inner-loop has the properties expected of inner-most loops
1531 	 as described above.  */
1532 
1533       if ((loop->inner)->inner || (loop->inner)->next)
1534 	{
1535 	  if (dump_enabled_p ())
1536 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 			     "not vectorized: multiple nested loops.\n");
1538 	  return false;
1539 	}
1540 
1541       if (loop->num_nodes != 5)
1542         {
1543 	  if (dump_enabled_p ())
1544 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 			     "not vectorized: control flow in loop.\n");
1546 	  return false;
1547         }
1548 
1549       entryedge = loop_preheader_edge (innerloop);
1550       if (entryedge->src != loop->header
1551 	  || !single_exit (innerloop)
1552 	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1553 	{
1554 	  if (dump_enabled_p ())
1555 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 			     "not vectorized: unsupported outerloop form.\n");
1557 	  return false;
1558 	}
1559 
1560       /* Analyze the inner-loop.  */
1561       tree inner_niterm1, inner_niter, inner_assumptions;
1562       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 				      &inner_assumptions, &inner_niterm1,
1564 				      &inner_niter, NULL)
1565 	  /* Don't support analyzing niter under assumptions for inner
1566 	     loop.  */
1567 	  || !integer_onep (inner_assumptions))
1568 	{
1569 	  if (dump_enabled_p ())
1570             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 			     "not vectorized: Bad inner loop.\n");
1572 	  return false;
1573 	}
1574 
1575       if (!expr_invariant_in_loop_p (loop, inner_niter))
1576 	{
1577 	  if (dump_enabled_p ())
1578 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 			     "not vectorized: inner-loop count not"
1580                              " invariant.\n");
1581 	  return false;
1582 	}
1583 
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_NOTE, vect_location,
1586 			 "Considering outer-loop vectorization.\n");
1587     }
1588 
1589   if (!single_exit (loop)
1590       || EDGE_COUNT (loop->header->preds) != 2)
1591     {
1592       if (dump_enabled_p ())
1593         {
1594           if (!single_exit (loop))
1595 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 			     "not vectorized: multiple exits.\n");
1597           else if (EDGE_COUNT (loop->header->preds) != 2)
1598 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 			     "not vectorized: too many incoming edges.\n");
1600         }
1601       return false;
1602     }
1603 
1604   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605      that the loop is represented as a do-while (with a proper if-guard
1606      before the loop if needed), where the loop header contains all the
1607      executable statements, and the latch is empty.  */
1608   if (!empty_block_p (loop->latch)
1609       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1610     {
1611       if (dump_enabled_p ())
1612 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 			 "not vectorized: latch block not empty.\n");
1614       return false;
1615     }
1616 
1617   /* Make sure the exit is not abnormal.  */
1618   edge e = single_exit (loop);
1619   if (e->flags & EDGE_ABNORMAL)
1620     {
1621       if (dump_enabled_p ())
1622 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 			 "not vectorized: abnormal loop exit edge.\n");
1624       return false;
1625     }
1626 
1627   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 				     number_of_iterationsm1);
1629   if (!*loop_cond)
1630     {
1631       if (dump_enabled_p ())
1632 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 			 "not vectorized: complicated exit condition.\n");
1634       return false;
1635     }
1636 
1637   if (integer_zerop (*assumptions)
1638       || !*number_of_iterations
1639       || chrec_contains_undetermined (*number_of_iterations))
1640     {
1641       if (dump_enabled_p ())
1642 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 			 "not vectorized: number of iterations cannot be "
1644 			 "computed.\n");
1645       return false;
1646     }
1647 
1648   if (integer_zerop (*number_of_iterations))
1649     {
1650       if (dump_enabled_p ())
1651 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 			 "not vectorized: number of iterations = 0.\n");
1653       return false;
1654     }
1655 
1656   return true;
1657 }
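
/* For illustration only: after if-conversion, a simple counted loop such as

        for (i = 0; i < n; i++)
          a[i] = b[i] + c[i];

   satisfies the checks above: a two-block body (a header holding all the
   statements plus an empty latch), a single normal exit and a computable
   iteration count.  In contrast, a body containing

        if (b[i] == 0)
          break;

   has a second exit edge and is rejected with "multiple exits", and any
   control flow that survives if-conversion makes loop->num_nodes exceed 2
   and triggers "control flow in loop".  */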
1658 
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1660 
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1663 {
1664   tree assumptions, number_of_iterations, number_of_iterationsm1;
1665   gcond *loop_cond, *inner_loop_cond = NULL;
1666 
1667   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 				  &assumptions, &number_of_iterationsm1,
1669 				  &number_of_iterations, &inner_loop_cond))
1670     return NULL;
1671 
1672   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676   if (!integer_onep (assumptions))
1677     {
1678       /* We consider vectorizing this loop by versioning it under
1679 	 some assumptions.  In order to do this, we need to clear
1680 	 existing information computed by the scev and niter analyzers.  */
1681       scev_reset_htab ();
1682       free_numbers_of_iterations_estimates (loop);
1683       /* Also set a flag for this loop so that subsequent scev and niter
1684 	 analyses are done under the assumptions.  */
1685       loop_constraint_set (loop, LOOP_C_FINITE);
1686       /* Also record the assumptions for versioning.  */
1687       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1688     }
1689 
1690   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1691     {
1692       if (dump_enabled_p ())
1693         {
1694           dump_printf_loc (MSG_NOTE, vect_location,
1695 			   "Symbolic number of iterations is ");
1696 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697           dump_printf (MSG_NOTE, "\n");
1698         }
1699     }
1700 
1701   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702   if (inner_loop_cond)
1703     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704       = loop_exit_ctrl_vec_info_type;
1705 
1706   gcc_assert (!loop->aux);
1707   loop->aux = loop_vinfo;
1708   return loop_vinfo;
1709 }
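
/* For illustration: when the trip count is a variable, e.g.

        for (i = 0; i < n; i++)
          a[i] = b[i];

   NITERS is recorded as the symbolic expression for the trip count and is
   dumped as the "Symbolic number of iterations".  When niter analysis can
   only compute the count under extra conditions (for instance that an
   induction variable does not wrap), those conditions end up in
   LOOP_VINFO_NITERS_ASSUMPTIONS and the loop is later versioned on them.  */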
1710 
1711 
1712 
1713 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1714    statements update the vectorization factor.  */
1715 
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1718 {
1719   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721   int nbbs = loop->num_nodes;
1722   poly_uint64 vectorization_factor;
1723   int i;
1724 
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_NOTE, vect_location,
1727 		     "=== vect_update_vf_for_slp ===\n");
1728 
1729   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730   gcc_assert (known_ne (vectorization_factor, 0U));
1731 
1732   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733      the vectorization factor of the loop is the unrolling factor required
1734      by the SLP instances.  If that unrolling factor is 1, we say that we
1735      perform pure SLP on the loop - cross-iteration parallelism is not
1736      exploited.  */
1737   bool only_slp_in_loop = true;
1738   for (i = 0; i < nbbs; i++)
1739     {
1740       basic_block bb = bbs[i];
1741       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 	   gsi_next (&si))
1743 	{
1744 	  gimple *stmt = gsi_stmt (si);
1745 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 	      && STMT_VINFO_RELATED_STMT (stmt_info))
1748 	    {
1749 	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 	      stmt_info = vinfo_for_stmt (stmt);
1751 	    }
1752 	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 	      && !PURE_SLP_STMT (stmt_info))
1755 	    /* STMT needs both SLP and loop-based vectorization.  */
1756 	    only_slp_in_loop = false;
1757 	}
1758     }
1759 
1760   if (only_slp_in_loop)
1761     {
1762       dump_printf_loc (MSG_NOTE, vect_location,
1763 		       "Loop contains only SLP stmts\n");
1764       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1765     }
1766   else
1767     {
1768       dump_printf_loc (MSG_NOTE, vect_location,
1769 		       "Loop contains SLP and non-SLP stmts\n");
1770       /* Both the vectorization factor and unroll factor have the form
1771 	 current_vector_size * X for some rational X, so they must have
1772 	 a common multiple.  */
1773       vectorization_factor
1774 	= force_common_multiple (vectorization_factor,
1775 				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1776     }
1777 
1778   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779   if (dump_enabled_p ())
1780     {
1781       dump_printf_loc (MSG_NOTE, vect_location,
1782 		       "Updating vectorization factor to ");
1783       dump_dec (MSG_NOTE, vectorization_factor);
1784       dump_printf (MSG_NOTE, ".\n");
1785     }
1786 }
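
/* Worked example (illustrative values): if the loop vectorization factor
   computed earlier is 4 and the SLP instances require an unrolling factor
   of 2, force_common_multiple yields 4 and nothing changes; with a loop VF
   of 2 and an SLP unrolling factor of 8 the updated VF becomes 8.  If every
   relevant stmt is pure SLP, the VF is simply replaced by the SLP unrolling
   factor.  */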
1787 
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789    the other phi in the reduction is also relevant for vectorization.
1790    This rejects cases such as:
1791 
1792       outer1:
1793 	x_1 = PHI <x_3(outer2), ...>;
1794 	...
1795 
1796       inner:
1797 	x_2 = ...;
1798 	...
1799 
1800       outer2:
1801 	x_3 = PHI <x_2(inner)>;
1802 
1803    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1804 
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1807 {
1808   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809     return false;
1810 
1811   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1813 }
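
/* An illustrative source form of such a double reduction:

        x = 0;
        for (i = 0; i < n; i++)      /* outer loop */
          for (j = 0; j < m; j++)    /* inner loop */
            x = x + a[i][j];

   The sum is carried around both loops, so both the outer-loop header phi
   (x_1 above) and the loop-closed phi in the outer-loop tail (x_3) take
   part in the reduction; the check above rejects the case where x_1 is
   never marked relevant.  */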
1814 
1815 /* Function vect_analyze_loop_operations.
1816 
1817    Scan the loop stmts and make sure they are all vectorizable.  */
1818 
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1821 {
1822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824   int nbbs = loop->num_nodes;
1825   int i;
1826   stmt_vec_info stmt_info;
1827   bool need_to_vectorize = false;
1828   bool ok;
1829 
1830   if (dump_enabled_p ())
1831     dump_printf_loc (MSG_NOTE, vect_location,
1832 		     "=== vect_analyze_loop_operations ===\n");
1833 
1834   for (i = 0; i < nbbs; i++)
1835     {
1836       basic_block bb = bbs[i];
1837 
1838       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 	   gsi_next (&si))
1840         {
1841           gphi *phi = si.phi ();
1842           ok = true;
1843 
1844           stmt_info = vinfo_for_stmt (phi);
1845           if (dump_enabled_p ())
1846             {
1847               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1849             }
1850 	  if (virtual_operand_p (gimple_phi_result (phi)))
1851 	    continue;
1852 
1853           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854              (i.e., a phi in the tail of the outer-loop).  */
1855           if (! is_loop_header_bb_p (bb))
1856             {
1857               /* FORNOW: we currently don't support the case where these phis
1858                  are not used in the outer loop (unless it is a double
1859                  reduction, i.e., this phi is vect_reduction_def), because
1860                  that case would require us to actually do something here.  */
1861               if (STMT_VINFO_LIVE_P (stmt_info)
1862 		  && !vect_active_double_reduction_p (stmt_info))
1863                 {
1864                   if (dump_enabled_p ())
1865 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 				     "Unsupported loop-closed phi in "
1867 				     "outer-loop.\n");
1868                   return false;
1869                 }
1870 
1871               /* If PHI is used in the outer loop, we check that its operand
1872                  is defined in the inner loop.  */
1873               if (STMT_VINFO_RELEVANT_P (stmt_info))
1874                 {
1875                   tree phi_op;
1876 		  gimple *op_def_stmt;
1877 
1878                   if (gimple_phi_num_args (phi) != 1)
1879                     return false;
1880 
1881                   phi_op = PHI_ARG_DEF (phi, 0);
1882                   if (TREE_CODE (phi_op) != SSA_NAME)
1883                     return false;
1884 
1885                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 		  if (gimple_nop_p (op_def_stmt)
1887 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 		      || !vinfo_for_stmt (op_def_stmt))
1889                     return false;
1890 
1891                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892                         != vect_used_in_outer
1893                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894                            != vect_used_in_outer_by_reduction)
1895                     return false;
1896                 }
1897 
1898               continue;
1899             }
1900 
1901           gcc_assert (stmt_info);
1902 
1903           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904                || STMT_VINFO_LIVE_P (stmt_info))
1905               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1906             {
1907               /* A scalar-dependence cycle that we don't support.  */
1908               if (dump_enabled_p ())
1909 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 				 "not vectorized: scalar dependence cycle.\n");
1911               return false;
1912             }
1913 
1914           if (STMT_VINFO_RELEVANT_P (stmt_info))
1915             {
1916               need_to_vectorize = true;
1917               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 		  && ! PURE_SLP_STMT (stmt_info))
1919                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 		       && ! PURE_SLP_STMT (stmt_info))
1923 		ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1924             }
1925 
1926 	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1927 	  if (ok
1928 	      && STMT_VINFO_LIVE_P (stmt_info)
1929 	      && !PURE_SLP_STMT (stmt_info))
1930 	    ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1931 
1932           if (!ok)
1933             {
1934               if (dump_enabled_p ())
1935                 {
1936 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 				   "not vectorized: relevant phi not "
1938 				   "supported: ");
1939                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1940                 }
1941 	      return false;
1942             }
1943         }
1944 
1945       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 	   gsi_next (&si))
1947         {
1948 	  gimple *stmt = gsi_stmt (si);
1949 	  if (!gimple_clobber_p (stmt)
1950 	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 	    return false;
1952         }
1953     } /* bbs */
1954 
1955   /* All operations in the loop are either irrelevant (they deal with loop
1956      control, or are dead), or are only used outside the loop and can be
1957      moved out of the loop (e.g. invariants, inductions).  The loop can be
1958      optimized away by scalar optimizations.  We're better off not
1959      touching this loop.  */
1960   if (!need_to_vectorize)
1961     {
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964 			 "All the computation can be taken out of the loop.\n");
1965       if (dump_enabled_p ())
1966 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 			 "not vectorized: redundant loop. no profit to "
1968 			 "vectorize.\n");
1969       return false;
1970     }
1971 
1972   return true;
1973 }
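
/* Illustration of the "redundant loop" rejection above: in

        for (i = 0; i < n; i++)
          x = c;

   the single assignment is loop invariant and only its final value is used
   after the loop, so nothing is marked relevant, need_to_vectorize stays
   false, and the loop is left to the scalar optimizers instead.  */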
1974 
1975 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1976    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1977    definitely no, or -1 if it's worth retrying.  */
1978 
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1981 {
1982   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1984 
1985   /* Only fully-masked loops can have iteration counts less than the
1986      vectorization factor.  */
1987   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988     {
1989       HOST_WIDE_INT max_niter;
1990 
1991       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993       else
1994 	max_niter = max_stmt_executions_int (loop);
1995 
1996       if (max_niter != -1
1997 	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1998 	{
1999 	  if (dump_enabled_p ())
2000 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 			     "not vectorized: iteration count smaller than "
2002 			     "vectorization factor.\n");
2003 	  return 0;
2004 	}
2005     }
2006 
2007   int min_profitable_iters, min_profitable_estimate;
2008   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 				      &min_profitable_estimate);
2010 
2011   if (min_profitable_iters < 0)
2012     {
2013       if (dump_enabled_p ())
2014 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 			 "not vectorized: vectorization not profitable.\n");
2016       if (dump_enabled_p ())
2017 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 			 "not vectorized: vector version will never be "
2019 			 "profitable.\n");
2020       return -1;
2021     }
2022 
2023   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 			       * assumed_vf);
2025 
2026   /* Use the cost model only if it is more conservative than the
2027      user-specified threshold.  */
2028   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 				    min_profitable_iters);
2030 
2031   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2032 
2033   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2035     {
2036       if (dump_enabled_p ())
2037 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 			 "not vectorized: vectorization not profitable.\n");
2039       if (dump_enabled_p ())
2040 	dump_printf_loc (MSG_NOTE, vect_location,
2041 			 "not vectorized: iteration count smaller than user "
2042 			 "specified loop bound parameter or minimum profitable "
2043 			 "iterations (whichever is more conservative).\n");
2044       return 0;
2045     }
2046 
2047   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048   if (estimated_niter == -1)
2049     estimated_niter = likely_max_stmt_executions_int (loop);
2050   if (estimated_niter != -1
2051       && ((unsigned HOST_WIDE_INT) estimated_niter
2052 	  < MAX (th, (unsigned) min_profitable_estimate)))
2053     {
2054       if (dump_enabled_p ())
2055 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 			 "not vectorized: estimated iteration count too "
2057 			 "small.\n");
2058       if (dump_enabled_p ())
2059 	dump_printf_loc (MSG_NOTE, vect_location,
2060 			 "not vectorized: estimated iteration count smaller "
2061 			 "than specified loop bound parameter or minimum "
2062 			 "profitable iterations (whichever is more "
2063 			 "conservative).\n");
2064       return -1;
2065     }
2066 
2067   return 1;
2068 }
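
/* Worked example with made-up numbers: with assumed_vf == 4,
   PARAM_MIN_VECT_LOOP_BOUND == 0 and a cost model answer of
   min_profitable_iters == 12, the threshold is th = MAX (0 * 4, 12) = 12,
   so a loop known to execute 8 iterations is rejected (return 0).  If only
   an estimate of 10 iterations is available and min_profitable_estimate is
   16, the estimate check fails instead and -1 is returned so that analysis
   may be retried (e.g. with a different vector size).  */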
2069 
2070 
2071 /* Function vect_analyze_loop_2.
2072 
2073    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074    for it.  The different analyses will record information in the
2075    loop_vec_info struct.  */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2078 {
2079   bool ok;
2080   int res;
2081   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082   poly_uint64 min_vf = 2;
2083   unsigned int n_stmts = 0;
2084 
2085   /* The first group of checks is independent of the vector size.  */
2086   fatal = true;
2087 
2088   /* Find all data references in the loop (which correspond to vdefs/vuses)
2089      and analyze their evolution in the loop.  */
2090 
2091   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2092 
2093   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2095     {
2096       if (dump_enabled_p ())
2097 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 			 "not vectorized: loop nest containing two "
2099 			 "or more consecutive inner loops cannot be "
2100 			 "vectorized\n");
2101       return false;
2102     }
2103 
2104   for (unsigned i = 0; i < loop->num_nodes; i++)
2105     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 	 !gsi_end_p (gsi); gsi_next (&gsi))
2107       {
2108 	gimple *stmt = gsi_stmt (gsi);
2109 	if (is_gimple_debug (stmt))
2110 	  continue;
2111 	++n_stmts;
2112 	if (!find_data_references_in_stmt (loop, stmt,
2113 					   &LOOP_VINFO_DATAREFS (loop_vinfo)))
2114 	  {
2115 	    if (is_gimple_call (stmt) && loop->safelen)
2116 	      {
2117 		tree fndecl = gimple_call_fndecl (stmt), op;
2118 		if (fndecl != NULL_TREE)
2119 		  {
2120 		    cgraph_node *node = cgraph_node::get (fndecl);
2121 		    if (node != NULL && node->simd_clones != NULL)
2122 		      {
2123 			unsigned int j, n = gimple_call_num_args (stmt);
2124 			for (j = 0; j < n; j++)
2125 			  {
2126 			    op = gimple_call_arg (stmt, j);
2127 			    if (DECL_P (op)
2128 				|| (REFERENCE_CLASS_P (op)
2129 				    && get_base_address (op)))
2130 			      break;
2131 			  }
2132 			op = gimple_call_lhs (stmt);
2133 			/* Ignore #pragma omp declare simd functions
2134 			   if they don't have data references in the
2135 			   call stmt itself.  */
2136 			if (j == n
2137 			    && !(op
2138 				 && (DECL_P (op)
2139 				     || (REFERENCE_CLASS_P (op)
2140 					 && get_base_address (op)))))
2141 			  continue;
2142 		      }
2143 		  }
2144 	      }
2145 	    if (dump_enabled_p ())
2146 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 			       "not vectorized: loop contains function "
2148 			       "calls or data references that cannot "
2149 			       "be analyzed\n");
2150 	    return false;
2151 	  }
2152       }
2153 
2154   /* Analyze the data references and also adjust the minimal
2155      vectorization factor according to the loads and stores.  */
2156 
2157   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158   if (!ok)
2159     {
2160       if (dump_enabled_p ())
2161 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 			 "bad data references.\n");
2163       return false;
2164     }
2165 
2166   /* Classify all cross-iteration scalar data-flow cycles.
2167      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2168   vect_analyze_scalar_cycles (loop_vinfo);
2169 
2170   vect_pattern_recog (loop_vinfo);
2171 
2172   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2173 
2174   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2176 
2177   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178   if (!ok)
2179     {
2180       if (dump_enabled_p ())
2181 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 			 "bad data access.\n");
2183       return false;
2184     }
2185 
2186   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2187 
2188   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189   if (!ok)
2190     {
2191       if (dump_enabled_p ())
2192 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 			 "unexpected pattern.\n");
2194       return false;
2195     }
2196 
2197   /* The rest of the analysis depends on the vector size, so failures below are no longer fatal.  */
2198   fatal = false;
2199 
2200   /* Analyze data dependences between the data-refs in the loop
2201      and adjust the maximum vectorization factor according to
2202      the dependences.
2203      FORNOW: fail at the first data dependence that we encounter.  */
2204 
2205   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206   if (!ok
2207       || (max_vf != MAX_VECTORIZATION_FACTOR
2208 	  && maybe_lt (max_vf, min_vf)))
2209     {
2210       if (dump_enabled_p ())
2211 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 			     "bad data dependence.\n");
2213       return false;
2214     }
2215   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2216 
2217   ok = vect_determine_vectorization_factor (loop_vinfo);
2218   if (!ok)
2219     {
2220       if (dump_enabled_p ())
2221 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 			 "can't determine vectorization factor.\n");
2223       return false;
2224     }
2225   if (max_vf != MAX_VECTORIZATION_FACTOR
2226       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2227     {
2228       if (dump_enabled_p ())
2229 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 			 "bad data dependence.\n");
2231       return false;
2232     }
2233 
2234   /* Compute the scalar iteration cost.  */
2235   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2236 
2237   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238   unsigned th;
2239 
2240   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2241   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242   if (!ok)
2243     return false;
2244 
2245   /* If there are any SLP instances mark them as pure_slp.  */
2246   bool slp = vect_make_slp_decision (loop_vinfo);
2247   if (slp)
2248     {
2249       /* Find stmts that need to be both vectorized and SLPed.  */
2250       vect_detect_hybrid_slp (loop_vinfo);
2251 
2252       /* Update the vectorization factor based on the SLP decision.  */
2253       vect_update_vf_for_slp (loop_vinfo);
2254     }
2255 
2256   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2257 
2258   /* We don't expect to have to roll back to anything other than an empty
2259      set of rgroups.  */
2260   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2261 
2262   /* This is the point where we can re-start analysis with SLP forced off.  */
2263 start_over:
2264 
2265   /* Now the vectorization factor is final.  */
2266   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267   gcc_assert (known_ne (vectorization_factor, 0U));
2268 
2269   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2270     {
2271       dump_printf_loc (MSG_NOTE, vect_location,
2272 		       "vectorization_factor = ");
2273       dump_dec (MSG_NOTE, vectorization_factor);
2274       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2276     }
2277 
2278   HOST_WIDE_INT max_niter
2279     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2280 
2281   /* Analyze the alignment of the data-refs in the loop.
2282      Fail if a data reference is found that cannot be vectorized.  */
2283 
2284   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 			 "bad data alignment.\n");
2290       return false;
2291     }
2292 
2293   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294      It is important to call pruning after vect_analyze_data_ref_accesses,
2295      since we use grouping information gathered by interleaving analysis.  */
2296   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297   if (!ok)
2298     return false;
2299 
2300   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301      vectorization.  */
2302   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2303     {
2304     /* This pass will decide on using loop versioning and/or loop peeling in
2305        order to enhance the alignment of data references in the loop.  */
2306     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307     if (!ok)
2308       {
2309 	if (dump_enabled_p ())
2310 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 			   "bad data alignment.\n");
2312         return false;
2313       }
2314     }
2315 
2316   if (slp)
2317     {
2318       /* Analyze operations in the SLP instances.  Note this may
2319 	 remove unsupported SLP instances which makes the above
2320 	 SLP kind detection invalid.  */
2321       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322       vect_slp_analyze_operations (loop_vinfo);
2323       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 	goto again;
2325     }
2326 
2327   /* Scan all the remaining operations in the loop that are not subject
2328      to SLP and make sure they are vectorizable.  */
2329   ok = vect_analyze_loop_operations (loop_vinfo);
2330   if (!ok)
2331     {
2332       if (dump_enabled_p ())
2333 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 			 "bad operation or unsupported loop bound.\n");
2335       return false;
2336     }
2337 
2338   /* Decide whether to use a fully-masked loop for this vectorization
2339      factor.  */
2340   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342        && vect_verify_full_masking (loop_vinfo));
2343   if (dump_enabled_p ())
2344     {
2345       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 	dump_printf_loc (MSG_NOTE, vect_location,
2347 			 "using a fully-masked loop.\n");
2348       else
2349 	dump_printf_loc (MSG_NOTE, vect_location,
2350 			 "not using a fully-masked loop.\n");
2351     }
2352 
2353   /* If an epilog loop is required because of data accesses with gaps,
2354      one additional iteration needs to be peeled.  Check if there are
2355      enough iterations for vectorization.  */
2356   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2359     {
2360       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2362 
2363       if (known_lt (wi::to_widest (scalar_niters), vf))
2364 	{
2365 	  if (dump_enabled_p ())
2366 	    dump_printf_loc (MSG_NOTE, vect_location,
2367 			     "loop does not have enough iterations to support"
2368 			     " peeling for gaps.\n");
2369 	  return false;
2370 	}
2371     }
2372 
2373   /* Check that the costings of the loop make vectorizing worthwhile.  */
2374   res = vect_analyze_loop_costing (loop_vinfo);
2375   if (res < 0)
2376     goto again;
2377   if (!res)
2378     {
2379       if (dump_enabled_p ())
2380 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 			 "Loop costings not worthwhile.\n");
2382       return false;
2383     }
2384 
2385   /* Decide whether we need to create an epilogue loop to handle
2386      remaining scalar iterations.  */
2387   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2388 
2389   unsigned HOST_WIDE_INT const_vf;
2390   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391     /* The main loop handles all iterations.  */
2392     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2395     {
2396       /* Work out the (constant) number of iterations that need to be
2397 	 peeled for reasons other than niters.  */
2398       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2399       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2400 	peel_niter += 1;
2401       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2402 		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2403 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2404     }
2405   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2406 	   /* ??? When peeling for gaps but not alignment, we could
2407 	      try to check whether the (variable) niters is known to be
2408 	      VF * N + 1.  That's something of a niche case though.  */
2409 	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2410 	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2411 	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2412 		< (unsigned) exact_log2 (const_vf))
2413 	       /* In case of versioning, check if the maximum number of
2414 		  iterations is greater than th.  If they are identical,
2415 		  the epilogue is unnecessary.  */
2416 	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2417 		   || ((unsigned HOST_WIDE_INT) max_niter
2418 		       > (th / const_vf) * const_vf))))
2419     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2420 
2421   /* If an epilogue loop is required make sure we can create one.  */
2422   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2423       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2424     {
2425       if (dump_enabled_p ())
2426         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2427       if (!vect_can_advance_ivs_p (loop_vinfo)
2428 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2429 					   single_exit (LOOP_VINFO_LOOP
2430 							 (loop_vinfo))))
2431         {
2432           if (dump_enabled_p ())
2433 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 			     "not vectorized: can't create required "
2435 			     "epilog loop\n");
2436           goto again;
2437         }
2438     }
2439 
2440   /* During peeling, we need to check if the number of loop iterations is
2441      enough for both the peeled prolog loop and the vector loop.  This check
2442      can be merged with the threshold check of loop versioning, so
2443      increase the threshold for this case if necessary.  */
2444   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2445     {
2446       poly_uint64 niters_th = 0;
2447 
2448       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2449 	{
2450 	  /* Niters for peeled prolog loop.  */
2451 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2452 	    {
2453 	      struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2454 	      tree vectype
2455 		= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2456 	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2457 	    }
2458 	  else
2459 	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2460 	}
2461 
2462       /* Niters for at least one iteration of vectorized loop.  */
2463       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2464 	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2465       /* One additional iteration because of peeling for gap.  */
2466       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2467 	niters_th += 1;
2468       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2469     }
2470 
2471   gcc_assert (known_eq (vectorization_factor,
2472 			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2473 
2474   /* Ok to vectorize!  */
2475   return true;
2476 
2477 again:
2478   /* Try again with SLP forced off but if we didn't do any SLP there is
2479      no point in re-trying.  */
2480   if (!slp)
2481     return false;
2482 
2483   /* If there are reduction chains re-trying will fail anyway.  */
2484   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2485     return false;
2486 
2487   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2488      via interleaving or lane instructions.  */
2489   slp_instance instance;
2490   slp_tree node;
2491   unsigned i, j;
2492   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2493     {
2494       stmt_vec_info vinfo;
2495       vinfo = vinfo_for_stmt
2496 	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2497       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2498 	continue;
2499       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2500       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2501       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2502       if (! vect_store_lanes_supported (vectype, size, false)
2503 	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2504 	 && ! vect_grouped_store_supported (vectype, size))
2505        return false;
2506       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2507 	{
2508 	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2509 	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2510 	  bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2511 	  size = STMT_VINFO_GROUP_SIZE (vinfo);
2512 	  vectype = STMT_VINFO_VECTYPE (vinfo);
2513 	  if (! vect_load_lanes_supported (vectype, size, false)
2514 	      && ! vect_grouped_load_supported (vectype, single_element_p,
2515 						size))
2516 	    return false;
2517 	}
2518     }
2519 
2520   if (dump_enabled_p ())
2521     dump_printf_loc (MSG_NOTE, vect_location,
2522 		     "re-trying with SLP disabled\n");
2523 
2524   /* Roll back state appropriately.  No SLP this time.  */
2525   slp = false;
2526   /* Restore the vectorization factor as it was without SLP.  */
2527   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2528   /* Free the SLP instances.  */
2529   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2530     vect_free_slp_instance (instance);
2531   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2532   /* Reset SLP type to loop_vect on all stmts.  */
2533   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2534     {
2535       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2536       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2537 	   !gsi_end_p (si); gsi_next (&si))
2538 	{
2539 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2540 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2541 	}
2542       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2543 	   !gsi_end_p (si); gsi_next (&si))
2544 	{
2545 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2546 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2547 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2548 	    {
2549 	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2550 	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2551 	      for (gimple_stmt_iterator pi
2552 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2553 		   !gsi_end_p (pi); gsi_next (&pi))
2554 		{
2555 		  gimple *pstmt = gsi_stmt (pi);
2556 		  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2557 		}
2558 	    }
2559 	}
2560     }
2561   /* Free optimized alias test DDRS.  */
2562   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2563   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2564   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2565   /* Reset target cost data.  */
2566   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2567   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2568     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2569   /* Reset accumulated rgroup information.  */
2570   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2571   /* Reset assorted flags.  */
2572   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2573   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2574   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2575   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2576   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2577 
2578   goto start_over;
2579 }
2580 
2581 /* Function vect_analyze_loop.
2582 
2583    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2584    for it.  The different analyses will record information in the
2585    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, LOOP is an
2586    epilogue loop of ORIG_LOOP_VINFO and must be vectorized.  */
2587 loop_vec_info
2588 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2589 {
2590   loop_vec_info loop_vinfo;
2591   auto_vector_sizes vector_sizes;
2592 
2593   /* Autodetect first vector size we try.  */
2594   current_vector_size = 0;
2595   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2596   unsigned int next_size = 0;
2597 
2598   if (dump_enabled_p ())
2599     dump_printf_loc (MSG_NOTE, vect_location,
2600 		     "===== analyze_loop_nest =====\n");
2601 
2602   if (loop_outer (loop)
2603       && loop_vec_info_for_loop (loop_outer (loop))
2604       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2605     {
2606       if (dump_enabled_p ())
2607 	dump_printf_loc (MSG_NOTE, vect_location,
2608 			 "outer-loop already vectorized.\n");
2609       return NULL;
2610     }
2611 
2612   poly_uint64 autodetected_vector_size = 0;
2613   while (1)
2614     {
2615       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2616       loop_vinfo = vect_analyze_loop_form (loop);
2617       if (!loop_vinfo)
2618 	{
2619 	  if (dump_enabled_p ())
2620 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621 			     "bad loop form.\n");
2622 	  return NULL;
2623 	}
2624 
2625       bool fatal = false;
2626 
2627       if (orig_loop_vinfo)
2628 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2629 
2630       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2631 	{
2632 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2633 
2634 	  return loop_vinfo;
2635 	}
2636 
2637       delete loop_vinfo;
2638 
2639       if (next_size == 0)
2640 	autodetected_vector_size = current_vector_size;
2641 
2642       if (next_size < vector_sizes.length ()
2643 	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
2644 	next_size += 1;
2645 
2646       if (fatal
2647 	  || next_size == vector_sizes.length ()
2648 	  || known_eq (current_vector_size, 0U))
2649 	return NULL;
2650 
2651       /* Try the next biggest vector size.  */
2652       current_vector_size = vector_sizes[next_size++];
2653       if (dump_enabled_p ())
2654 	{
2655 	  dump_printf_loc (MSG_NOTE, vect_location,
2656 			   "***** Re-trying analysis with "
2657 			   "vector size ");
2658 	  dump_dec (MSG_NOTE, current_vector_size);
2659 	  dump_printf (MSG_NOTE, "\n");
2660 	}
2661     }
2662 }
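
/* For illustration: on a target that advertises, say, 32-byte and 16-byte
   vectors, the first iteration of the loop above analyzes with the
   autodetected size (current_vector_size == 0).  If vect_analyze_loop_2
   fails but FATAL is false, the loop form is re-analyzed and the analysis
   is repeated with the next size from autovectorize_vector_sizes, until a
   size succeeds or the list is exhausted.  */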
2663 
2664 /* Return true if there is an in-order reduction function for CODE, storing
2665    it in *REDUC_FN if so.  */
2666 
2667 static bool
2668 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2669 {
2670   switch (code)
2671     {
2672     case PLUS_EXPR:
2673       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2674       return true;
2675 
2676     default:
2677       return false;
2678     }
2679 }
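
/* For example, a strict in-order float summation such as

        for (i = 0; i < n; i++)
          s = s + a[i];

   compiled without -fassociative-math maps PLUS_EXPR to IFN_FOLD_LEFT_PLUS
   here (when an in-order reduction is required); any other reduction code
   has no in-order internal function and the function returns false.  */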
2680 
2681 /* Function reduction_fn_for_scalar_code
2682 
2683    Input:
2684    CODE - tree_code of a reduction operation.
2685 
2686    Output:
2687    REDUC_FN - the corresponding internal function to be used to reduce the
2688       vector of partial results into a single scalar result, or IFN_LAST
2689       if the operation is a supported reduction operation, but does not have
2690       such an internal function.
2691 
2692    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2693 
2694 static bool
2695 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2696 {
2697   switch (code)
2698     {
2699       case MAX_EXPR:
2700         *reduc_fn = IFN_REDUC_MAX;
2701         return true;
2702 
2703       case MIN_EXPR:
2704         *reduc_fn = IFN_REDUC_MIN;
2705         return true;
2706 
2707       case PLUS_EXPR:
2708         *reduc_fn = IFN_REDUC_PLUS;
2709         return true;
2710 
2711       case BIT_AND_EXPR:
2712 	*reduc_fn = IFN_REDUC_AND;
2713 	return true;
2714 
2715       case BIT_IOR_EXPR:
2716 	*reduc_fn = IFN_REDUC_IOR;
2717 	return true;
2718 
2719       case BIT_XOR_EXPR:
2720 	*reduc_fn = IFN_REDUC_XOR;
2721 	return true;
2722 
2723       case MULT_EXPR:
2724       case MINUS_EXPR:
2725         *reduc_fn = IFN_LAST;
2726         return true;
2727 
2728       default:
2729        return false;
2730     }
2731 }
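
/* For example, the max reduction

        for (i = 0; i < n; i++)
          m = m > a[i] ? m : a[i];

   is typically recognized as a MAX_EXPR reduction and uses IFN_REDUC_MAX to
   collapse the vector of partial maxima into a scalar, whereas MULT_EXPR
   and MINUS_EXPR are accepted but report IFN_LAST, so their reduction
   epilogue is generated without a dedicated internal function.  */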
2732 
2733 /* If there is a neutral value X such that SLP reduction NODE would not
2734    be affected by the introduction of additional X elements, return that X,
2735    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2736    is true if the SLP statements perform a single reduction, false if each
2737    statement performs an independent reduction.  */
2738 
2739 static tree
2740 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2741 			      bool reduc_chain)
2742 {
2743   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2744   gimple *stmt = stmts[0];
2745   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2746   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2747   tree scalar_type = TREE_TYPE (vector_type);
2748   struct loop *loop = gimple_bb (stmt)->loop_father;
2749   gcc_assert (loop);
2750 
2751   switch (code)
2752     {
2753     case WIDEN_SUM_EXPR:
2754     case DOT_PROD_EXPR:
2755     case SAD_EXPR:
2756     case PLUS_EXPR:
2757     case MINUS_EXPR:
2758     case BIT_IOR_EXPR:
2759     case BIT_XOR_EXPR:
2760       return build_zero_cst (scalar_type);
2761 
2762     case MULT_EXPR:
2763       return build_one_cst (scalar_type);
2764 
2765     case BIT_AND_EXPR:
2766       return build_all_ones_cst (scalar_type);
2767 
2768     case MAX_EXPR:
2769     case MIN_EXPR:
2770       /* For MIN/MAX the initial values are neutral.  A reduction chain
2771 	 has only a single initial value, so that value is neutral for
2772 	 all statements.  */
2773       if (reduc_chain)
2774 	return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2775       return NULL_TREE;
2776 
2777     default:
2778       return NULL_TREE;
2779     }
2780 }
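
/* For illustration: padding a summation group with extra zero elements, a
   product with ones, or a BIT_AND with all-ones values leaves the result
   unchanged, which is exactly what the neutral values above encode.  MIN
   and MAX have no universal neutral constant, so only a reduction chain,
   which has a single initial value, can reuse that initial value as the
   neutral element.  */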
2781 
2782 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2783    STMT is printed with a message MSG. */
2784 
2785 static void
2786 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2787 {
2788   dump_printf_loc (msg_type, vect_location, "%s", msg);
2789   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2790 }
2791 
2792 
2793 /* Detect SLP reduction of the form:
2794 
2795    #a1 = phi <a5, a0>
2796    a2 = operation (a1)
2797    a3 = operation (a2)
2798    a4 = operation (a3)
2799    a5 = operation (a4)
2800 
2801    #a = phi <a5>
2802 
2803    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2804    FIRST_STMT is the first reduction stmt in the chain
2805    (a2 = operation (a1)).
2806 
2807    Return TRUE if a reduction chain was detected.  */
2808 
2809 static bool
2810 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2811 		       gimple *first_stmt)
2812 {
2813   struct loop *loop = (gimple_bb (phi))->loop_father;
2814   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2815   enum tree_code code;
2816   gimple *loop_use_stmt = NULL;
2817   stmt_vec_info use_stmt_info;
2818   tree lhs;
2819   imm_use_iterator imm_iter;
2820   use_operand_p use_p;
2821   int nloop_uses, size = 0, n_out_of_loop_uses;
2822   bool found = false;
2823 
2824   if (loop != vect_loop)
2825     return false;
2826 
2827   auto_vec<stmt_vec_info, 8> reduc_chain;
2828   lhs = PHI_RESULT (phi);
2829   code = gimple_assign_rhs_code (first_stmt);
2830   while (1)
2831     {
2832       nloop_uses = 0;
2833       n_out_of_loop_uses = 0;
2834       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2835         {
2836 	  gimple *use_stmt = USE_STMT (use_p);
2837 	  if (is_gimple_debug (use_stmt))
2838 	    continue;
2839 
2840           /* Check if we got back to the reduction phi.  */
2841 	  if (use_stmt == phi)
2842             {
2843 	      loop_use_stmt = use_stmt;
2844               found = true;
2845               break;
2846             }
2847 
2848           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2849             {
2850 	      loop_use_stmt = use_stmt;
2851 	      nloop_uses++;
2852             }
2853            else
2854              n_out_of_loop_uses++;
2855 
2856            /* There can be either a single use in the loop or two uses in
2857               phi nodes.  */
2858            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2859              return false;
2860         }
2861 
2862       if (found)
2863         break;
2864 
2865       /* We reached a statement with no loop uses.  */
2866       if (nloop_uses == 0)
2867 	return false;
2868 
2869       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2870       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2871         return false;
2872 
2873       if (!is_gimple_assign (loop_use_stmt)
2874 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2875 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2876         return false;
2877 
2878       /* Insert USE_STMT into reduction chain.  */
2879       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2880       reduc_chain.safe_push (use_stmt_info);
2881 
2882       lhs = gimple_assign_lhs (loop_use_stmt);
2883       size++;
2884    }
2885 
2886   if (!found || loop_use_stmt != phi || size < 2)
2887     return false;
2888 
2889   /* Swap the operands, if needed, to make the reduction operand the second
2890      operand.  */
2891   lhs = PHI_RESULT (phi);
2892   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2893     {
2894       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2895       if (gimple_assign_rhs2 (next_stmt) == lhs)
2896 	{
2897 	  tree op = gimple_assign_rhs1 (next_stmt);
2898 	  gimple *def_stmt = NULL;
2899 
2900           if (TREE_CODE (op) == SSA_NAME)
2901             def_stmt = SSA_NAME_DEF_STMT (op);
2902 
2903 	  /* Check that the other def is either defined in the loop
2904 	     ("vect_internal_def"), or it's an induction (defined by a
2905 	     loop-header phi-node).  */
2906           if (def_stmt
2907               && gimple_bb (def_stmt)
2908 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909               && (is_gimple_assign (def_stmt)
2910                   || is_gimple_call (def_stmt)
2911                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912                            == vect_induction_def
2913                   || (gimple_code (def_stmt) == GIMPLE_PHI
2914                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915                                   == vect_internal_def
2916                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2917 	    {
2918 	      lhs = gimple_assign_lhs (next_stmt);
2919  	      continue;
2920 	    }
2921 
2922 	  return false;
2923 	}
2924       else
2925 	{
2926           tree op = gimple_assign_rhs2 (next_stmt);
2927 	  gimple *def_stmt = NULL;
2928 
2929           if (TREE_CODE (op) == SSA_NAME)
2930             def_stmt = SSA_NAME_DEF_STMT (op);
2931 
2932           /* Check that the other def is either defined in the loop
2933             ("vect_internal_def"), or it's an induction (defined by a
2934             loop-header phi-node).  */
2935           if (def_stmt
2936               && gimple_bb (def_stmt)
2937 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938               && (is_gimple_assign (def_stmt)
2939                   || is_gimple_call (def_stmt)
2940                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941                               == vect_induction_def
2942                   || (gimple_code (def_stmt) == GIMPLE_PHI
2943                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944                                   == vect_internal_def
2945                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2946   	    {
2947 	      if (dump_enabled_p ())
2948 		{
2949 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2951 		}
2952 
2953 	      swap_ssa_operands (next_stmt,
2954 	 		         gimple_assign_rhs1_ptr (next_stmt),
2955                                  gimple_assign_rhs2_ptr (next_stmt));
2956 	      update_stmt (next_stmt);
2957 
2958 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2960 	    }
2961 	  else
2962 	    return false;
2963         }
2964 
2965       lhs = gimple_assign_lhs (next_stmt);
2966     }
2967 
2968   /* Build up the actual chain.  */
2969   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2970     {
2971       GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2972       GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2973     }
2974   GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2975   GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2976 
2977   /* Save the chain for further analysis in SLP detection.  */
2978   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2979   GROUP_SIZE (reduc_chain[0]) = size;
2980 
2981   return true;
2982 }
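
/* An illustrative source loop that produces the reduction chain pattern
   documented above:

        for (i = 0; i < n; i++)
          {
            s = s + a[4*i];
            s = s + a[4*i + 1];
            s = s + a[4*i + 2];
            s = s + a[4*i + 3];
          }

   The four chained PLUS statements correspond to a2..a5; the chain of
   GROUP_SIZE 4 is linked through GROUP_FIRST/NEXT_ELEMENT and recorded in
   LOOP_VINFO_REDUCTION_CHAINS for later SLP analysis.  */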
2983 
2984 /* Return true if we need an in-order reduction for operation CODE
2985    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2986    overflow must wrap.  */
2987 
2988 static bool
2989 needs_fold_left_reduction_p (tree type, tree_code code,
2990 			     bool need_wrapping_integral_overflow)
2991 {
2992   /* CHECKME: check for !flag_finite_math_only too?  */
2993   if (SCALAR_FLOAT_TYPE_P (type))
2994     switch (code)
2995       {
2996       case MIN_EXPR:
2997       case MAX_EXPR:
2998 	return false;
2999 
3000       default:
3001 	return !flag_associative_math;
3002       }
3003 
3004   if (INTEGRAL_TYPE_P (type))
3005     {
3006       if (!operation_no_trapping_overflow (type, code))
3007 	return true;
3008       if (need_wrapping_integral_overflow
3009 	  && !TYPE_OVERFLOW_WRAPS (type)
3010 	  && operation_can_overflow (code))
3011 	return true;
3012       return false;
3013     }
3014 
3015   if (SAT_FIXED_POINT_TYPE_P (type))
3016     return true;
3017 
3018   return false;
3019 }
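
/* Examples of the classification above (illustrative): a float summation
   needs an in-order reduction unless -fassociative-math is in effect,
   while float MIN/MAX reductions never do; a signed integer addition needs
   one when the operation may trap on overflow (e.g. with -ftrapv), or when
   wrapping behaviour is required but the type does not wrap.  */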
3020 
3021 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3022    reduction operation CODE has a handled computation expression.  */
3023 
3024 bool
3025 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3026 		      enum tree_code code)
3027 {
3028   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3029   auto_bitmap visited;
3030   tree lookfor = PHI_RESULT (phi);
3031   ssa_op_iter curri;
3032   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3033   while (USE_FROM_PTR (curr) != loop_arg)
3034     curr = op_iter_next_use (&curri);
3035   curri.i = curri.numops;
3036   do
3037     {
3038       path.safe_push (std::make_pair (curri, curr));
3039       tree use = USE_FROM_PTR (curr);
3040       if (use == lookfor)
3041 	break;
3042       gimple *def = SSA_NAME_DEF_STMT (use);
3043       if (gimple_nop_p (def)
3044 	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3045 	{
3046 pop:
3047 	  do
3048 	    {
3049 	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3050 	      curri = x.first;
3051 	      curr = x.second;
3052 	      do
3053 		curr = op_iter_next_use (&curri);
3054 	      /* Skip already visited or non-SSA operands (from iterating
3055 	         over PHI args).  */
3056 	      while (curr != NULL_USE_OPERAND_P
3057 		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3058 			 || ! bitmap_set_bit (visited,
3059 					      SSA_NAME_VERSION
3060 					        (USE_FROM_PTR (curr)))));
3061 	    }
3062 	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3063 	  if (curr == NULL_USE_OPERAND_P)
3064 	    break;
3065 	}
3066       else
3067 	{
3068 	  if (gimple_code (def) == GIMPLE_PHI)
3069 	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3070 	  else
3071 	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3072 	  while (curr != NULL_USE_OPERAND_P
3073 		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3074 		     || ! bitmap_set_bit (visited,
3075 					  SSA_NAME_VERSION
3076 					    (USE_FROM_PTR (curr)))))
3077 	    curr = op_iter_next_use (&curri);
3078 	  if (curr == NULL_USE_OPERAND_P)
3079 	    goto pop;
3080 	}
3081     }
3082   while (1);
3083   if (dump_file && (dump_flags & TDF_DETAILS))
3084     {
3085       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3086       unsigned i;
3087       std::pair<ssa_op_iter, use_operand_p> *x;
3088       FOR_EACH_VEC_ELT (path, i, x)
3089 	{
3090 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3091 	  dump_printf (MSG_NOTE, " ");
3092 	}
3093       dump_printf (MSG_NOTE, "\n");
3094     }
3095 
3096   /* Check whether the reduction path detected is valid.  */
3097   bool fail = path.length () == 0;
3098   bool neg = false;
3099   for (unsigned i = 1; i < path.length (); ++i)
3100     {
3101       gimple *use_stmt = USE_STMT (path[i].second);
3102       tree op = USE_FROM_PTR (path[i].second);
3103       if (! has_single_use (op)
3104 	  || ! is_gimple_assign (use_stmt))
3105 	{
3106 	  fail = true;
3107 	  break;
3108 	}
3109       if (gimple_assign_rhs_code (use_stmt) != code)
3110 	{
3111 	  if (code == PLUS_EXPR
3112 	      && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3113 	    {
3114 	      /* Track whether we negate the reduction value each iteration.  */
3115 	      if (gimple_assign_rhs2 (use_stmt) == op)
3116 		neg = ! neg;
3117 	    }
3118 	  else
3119 	    {
3120 	      fail = true;
3121 	      break;
3122 	    }
3123 	}
3124     }
3125   return ! fail && ! neg;
3126 }
3127 
3128 
3129 /* Function vect_is_simple_reduction
3130 
3131    (1) Detect a cross-iteration def-use cycle that represents a simple
3132    reduction computation.  We look for the following pattern:
3133 
3134    loop_header:
3135      a1 = phi < a0, a2 >
3136      a3 = ...
3137      a2 = operation (a3, a1)
3138 
3139    or
3140 
3141    a3 = ...
3142    loop_header:
3143      a1 = phi < a0, a2 >
3144      a2 = operation (a3, a1)
3145 
3146    such that:
3147    1. operation is commutative and associative and it is safe to
3148       change the order of the computation
3149    2. no uses for a2 in the loop (a2 is used out of the loop)
3150    3. no uses of a1 in the loop besides the reduction operation
3151    4. no uses of a1 outside the loop.
3152 
3153    Conditions 1,4 are tested here.
3154    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3155 
3156    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3157    nested cycles.
3158 
3159    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3160    reductions:
3161 
3162      a1 = phi < a0, a2 >
3163      inner loop (def of a3)
3164      a2 = phi < a3 >
3165 
3166    (4) Detect condition expressions, i.e.:
3167      for (int i = 0; i < N; i++)
3168        if (a[i] < val)
3169 	ret_val = a[i];
3170 
3171 */
3172 
3173 static gimple *
3174 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3175 			  bool *double_reduc,
3176 			  bool need_wrapping_integral_overflow,
3177 			  enum vect_reduction_type *v_reduc_type)
3178 {
3179   struct loop *loop = (gimple_bb (phi))->loop_father;
3180   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3181   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3182   enum tree_code orig_code, code;
3183   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3184   tree type;
3185   int nloop_uses;
3186   tree name;
3187   imm_use_iterator imm_iter;
3188   use_operand_p use_p;
3189   bool phi_def;
3190 
3191   *double_reduc = false;
3192   *v_reduc_type = TREE_CODE_REDUCTION;
3193 
3194   tree phi_name = PHI_RESULT (phi);
3195   /* ???  If there are no uses of the PHI result the inner loop reduction
3196      won't be detected as possibly double-reduction by vectorizable_reduction
3197      because that tries to walk the PHI arg from the preheader edge which
3198      can be constant.  See PR60382.  */
3199   if (has_zero_uses (phi_name))
3200     return NULL;
3201   nloop_uses = 0;
3202   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3203     {
3204       gimple *use_stmt = USE_STMT (use_p);
3205       if (is_gimple_debug (use_stmt))
3206 	continue;
3207 
3208       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3209         {
3210           if (dump_enabled_p ())
3211 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3212 			     "intermediate value used outside loop.\n");
3213 
3214           return NULL;
3215         }
3216 
3217       nloop_uses++;
3218       if (nloop_uses > 1)
3219         {
3220           if (dump_enabled_p ())
3221 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3222 			     "reduction value used in loop.\n");
3223           return NULL;
3224         }
3225 
3226       phi_use_stmt = use_stmt;
3227     }
3228 
3229   edge latch_e = loop_latch_edge (loop);
3230   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3231   if (TREE_CODE (loop_arg) != SSA_NAME)
3232     {
3233       if (dump_enabled_p ())
3234 	{
3235 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236 			   "reduction: not ssa_name: ");
3237 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3238           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3239 	}
3240       return NULL;
3241     }
3242 
3243   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3244   if (is_gimple_assign (def_stmt))
3245     {
3246       name = gimple_assign_lhs (def_stmt);
3247       phi_def = false;
3248     }
3249   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3250     {
3251       name = PHI_RESULT (def_stmt);
3252       phi_def = true;
3253     }
3254   else
3255     {
3256       if (dump_enabled_p ())
3257 	{
3258 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259 			   "reduction: unhandled reduction operation: ");
3260 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3261 	}
3262       return NULL;
3263     }
3264 
3265   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3266     return NULL;
3267 
3268   nloop_uses = 0;
3269   auto_vec<gphi *, 3> lcphis;
3270   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3271     {
3272       gimple *use_stmt = USE_STMT (use_p);
3273       if (is_gimple_debug (use_stmt))
3274 	continue;
3275       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3276 	nloop_uses++;
3277       else
3278 	/* We can have more than one loop-closed PHI.  */
3279 	lcphis.safe_push (as_a <gphi *> (use_stmt));
3280       if (nloop_uses > 1)
3281 	{
3282 	  if (dump_enabled_p ())
3283 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3284 			     "reduction used in loop.\n");
3285 	  return NULL;
3286 	}
3287     }
3288 
3289   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3290      defined in the inner loop.  */
3291   if (phi_def)
3292     {
3293       op1 = PHI_ARG_DEF (def_stmt, 0);
3294 
3295       if (gimple_phi_num_args (def_stmt) != 1
3296           || TREE_CODE (op1) != SSA_NAME)
3297         {
3298           if (dump_enabled_p ())
3299 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3300 			     "unsupported phi node definition.\n");
3301 
3302           return NULL;
3303         }
3304 
3305       def1 = SSA_NAME_DEF_STMT (op1);
3306       if (gimple_bb (def1)
3307 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3308           && loop->inner
3309           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3310           && is_gimple_assign (def1)
3311 	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3312         {
3313           if (dump_enabled_p ())
3314             report_vect_op (MSG_NOTE, def_stmt,
3315 			    "detected double reduction: ");
3316 
3317           *double_reduc = true;
3318           return def_stmt;
3319         }
3320 
3321       return NULL;
3322     }
3323 
3324   /* If we are vectorizing an inner reduction, we execute it in the
3325      original order only when we are not dealing with a double
3326      reduction.  */
3327   bool check_reduction = true;
3328   if (flow_loop_nested_p (vect_loop, loop))
3329     {
3330       gphi *lcphi;
3331       unsigned i;
3332       check_reduction = false;
3333       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3334 	FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3335 	  {
3336 	    gimple *use_stmt = USE_STMT (use_p);
3337 	    if (is_gimple_debug (use_stmt))
3338 	      continue;
3339 	    if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3340 	      check_reduction = true;
3341 	  }
3342     }
3343 
3344   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3345   code = orig_code = gimple_assign_rhs_code (def_stmt);
3346 
3347   /* We can handle "res -= x[i]", which is non-associative, by simply
3348      rewriting it as "res += -x[i]".  Avoid changing the gimple
3349      instruction for the first simple tests and only do this if we're
3350      allowed to change the code at all.  */
3351   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3352     code = PLUS_EXPR;
3353 
3354   if (code == COND_EXPR)
3355     {
3356       if (! nested_in_vect_loop)
3357 	*v_reduc_type = COND_REDUCTION;
3358 
3359       op3 = gimple_assign_rhs1 (def_stmt);
3360       if (COMPARISON_CLASS_P (op3))
3361         {
3362           op4 = TREE_OPERAND (op3, 1);
3363           op3 = TREE_OPERAND (op3, 0);
3364         }
3365       if (op3 == phi_name || op4 == phi_name)
3366 	{
3367 	  if (dump_enabled_p ())
3368 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3369 			    "reduction: condition depends on previous"
3370 			    " iteration: ");
3371 	  return NULL;
3372 	}
3373 
3374       op1 = gimple_assign_rhs2 (def_stmt);
3375       op2 = gimple_assign_rhs3 (def_stmt);
3376     }
3377   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3378     {
3379       if (dump_enabled_p ())
3380 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3381 			"reduction: not commutative/associative: ");
3382       return NULL;
3383     }
3384   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3385     {
3386       op1 = gimple_assign_rhs1 (def_stmt);
3387       op2 = gimple_assign_rhs2 (def_stmt);
3388     }
3389   else
3390     {
3391       if (dump_enabled_p ())
3392 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3393 			"reduction: not handled operation: ");
3394       return NULL;
3395     }
3396 
3397   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3398     {
3399       if (dump_enabled_p ())
3400 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3401 			"reduction: both uses not ssa_names: ");
3402 
3403       return NULL;
3404     }
3405 
3406   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3407   if ((TREE_CODE (op1) == SSA_NAME
3408        && !types_compatible_p (type,TREE_TYPE (op1)))
3409       || (TREE_CODE (op2) == SSA_NAME
3410           && !types_compatible_p (type, TREE_TYPE (op2)))
3411       || (op3 && TREE_CODE (op3) == SSA_NAME
3412           && !types_compatible_p (type, TREE_TYPE (op3)))
3413       || (op4 && TREE_CODE (op4) == SSA_NAME
3414           && !types_compatible_p (type, TREE_TYPE (op4))))
3415     {
3416       if (dump_enabled_p ())
3417         {
3418           dump_printf_loc (MSG_NOTE, vect_location,
3419 			   "reduction: multiple types: operation type: ");
3420           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3421           dump_printf (MSG_NOTE, ", operands types: ");
3422           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3423 			     TREE_TYPE (op1));
3424           dump_printf (MSG_NOTE, ",");
3425           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3426 			     TREE_TYPE (op2));
3427           if (op3)
3428             {
3429               dump_printf (MSG_NOTE, ",");
3430               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431 				 TREE_TYPE (op3));
3432             }
3433 
3434           if (op4)
3435             {
3436               dump_printf (MSG_NOTE, ",");
3437               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3438 				 TREE_TYPE (op4));
3439             }
3440           dump_printf (MSG_NOTE, "\n");
3441         }
3442 
3443       return NULL;
3444     }
3445 
3446   /* Check whether it's ok to change the order of the computation.
3447      Generally, when vectorizing a reduction we change the order of the
3448      computation.  This may change the behavior of the program in some
3449      cases, so we need to check that this is ok.  One exception is when
3450      vectorizing an outer-loop: the inner-loop is executed sequentially,
3451      and therefore vectorizing reductions in the inner-loop during
3452      outer-loop vectorization is safe.  */
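  /* For example (illustrative only), without -ffast-math a floating-point
     accumulation such as

       double s = 0.0;
       for (int i = 0; i < n; i++)
	 s += a[i];

     must preserve the original association order, so it is classified as
     FOLD_LEFT_REDUCTION below instead of an ordinary tree-code reduction.  */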
3453   if (check_reduction
3454       && *v_reduc_type == TREE_CODE_REDUCTION
3455       && needs_fold_left_reduction_p (type, code,
3456 				      need_wrapping_integral_overflow))
3457     *v_reduc_type = FOLD_LEFT_REDUCTION;
3458 
3459   /* Reduction is safe. We're dealing with one of the following:
3460      1) integer arithmetic and no trapv
3461      2) floating point arithmetic, and special flags permit this optimization
3462      3) nested cycle (i.e., outer loop vectorization).  */
3463   if (TREE_CODE (op1) == SSA_NAME)
3464     def1 = SSA_NAME_DEF_STMT (op1);
3465 
3466   if (TREE_CODE (op2) == SSA_NAME)
3467     def2 = SSA_NAME_DEF_STMT (op2);
3468 
3469   if (code != COND_EXPR
3470       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3471     {
3472       if (dump_enabled_p ())
3473 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3474       return NULL;
3475     }
3476 
3477   /* Check that one def is the reduction def, defined by PHI,
3478      the other def is either defined in the loop ("vect_internal_def"),
3479      or it's an induction (defined by a loop-header phi-node).  */
3480 
3481   if (def2 && def2 == phi
3482       && (code == COND_EXPR
3483 	  || !def1 || gimple_nop_p (def1)
3484 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3485           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3486               && (is_gimple_assign (def1)
3487 		  || is_gimple_call (def1)
3488   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3489                       == vect_induction_def
3490    	          || (gimple_code (def1) == GIMPLE_PHI
3491 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3492                           == vect_internal_def
3493  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
3494     {
3495       if (dump_enabled_p ())
3496 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3497       return def_stmt;
3498     }
3499 
3500   if (def1 && def1 == phi
3501       && (code == COND_EXPR
3502 	  || !def2 || gimple_nop_p (def2)
3503 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3504 	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3505 	      && (is_gimple_assign (def2)
3506 		  || is_gimple_call (def2)
3507 		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3508 		       == vect_induction_def
3509 		  || (gimple_code (def2) == GIMPLE_PHI
3510 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3511 			   == vect_internal_def
3512 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
3513     {
3514       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3515 	{
3516 	  /* Check if we can swap operands (just for simplicity - so that
3517 	     the rest of the code can assume that the reduction variable
3518 	     is always the last (second) argument).  */
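	  /* Illustrative examples, not from the source: "res = res + a[i]"
	     is rewritten as "res = a[i] + res", and a COND_EXPR such as
	     "res = x < y ? res : a[i]" becomes "res = x >= y ? a[i] : res"
	     (using the unordered inverse instead when NaNs must be
	     honored).  */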
3519 	  if (code == COND_EXPR)
3520 	    {
3521 	      /* Swap cond_expr by inverting the condition.  */
3522 	      tree cond_expr = gimple_assign_rhs1 (def_stmt);
3523 	      enum tree_code invert_code = ERROR_MARK;
3524 	      enum tree_code cond_code = TREE_CODE (cond_expr);
3525 
3526 	      if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3527 		{
3528 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3529 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
3530 		}
3531 	      if (invert_code != ERROR_MARK)
3532 		{
3533 		  TREE_SET_CODE (cond_expr, invert_code);
3534 		  swap_ssa_operands (def_stmt,
3535 				     gimple_assign_rhs2_ptr (def_stmt),
3536 				     gimple_assign_rhs3_ptr (def_stmt));
3537 		}
3538 	      else
3539 		{
3540 		  if (dump_enabled_p ())
3541 		    report_vect_op (MSG_NOTE, def_stmt,
3542 				    "detected reduction: cannot swap operands "
3543 				    "for cond_expr");
3544 		  return NULL;
3545 		}
3546 	    }
3547 	  else
3548 	    swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3549 			       gimple_assign_rhs2_ptr (def_stmt));
3550 
3551 	  if (dump_enabled_p ())
3552 	    report_vect_op (MSG_NOTE, def_stmt,
3553 			    "detected reduction: need to swap operands: ");
3554 
3555 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3556 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3557         }
3558       else
3559         {
3560           if (dump_enabled_p ())
3561             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3562         }
3563 
3564       return def_stmt;
3565     }
3566 
3567   /* Try to find SLP reduction chain.  */
3568   if (! nested_in_vect_loop
3569       && code != COND_EXPR
3570       && orig_code != MINUS_EXPR
3571       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3572     {
3573       if (dump_enabled_p ())
3574         report_vect_op (MSG_NOTE, def_stmt,
3575 			"reduction: detected reduction chain: ");
3576 
3577       return def_stmt;
3578     }
3579 
3580   /* Dissolve any group that vect_is_slp_reduction may have half-built.  */
3581   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3582   while (first)
3583     {
3584       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3585       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3586       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3587       first = next;
3588     }
3589 
3590   /* Look for the expression computing loop_arg from loop PHI result.  */
3591   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3592 			    code))
3593     return def_stmt;
3594 
3595   if (dump_enabled_p ())
3596     {
3597       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3598 		      "reduction: unknown pattern: ");
3599     }
3600 
3601   return NULL;
3602 }
3603 
3604 /* Wrapper around vect_is_simple_reduction, which will modify code
3605    in-place if it enables detection of more reductions.  The arguments
3606    are the same as for that function.  */
3607 
3608 gimple *
3609 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3610 			     bool *double_reduc,
3611 			     bool need_wrapping_integral_overflow)
3612 {
3613   enum vect_reduction_type v_reduc_type;
3614   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3615 					  need_wrapping_integral_overflow,
3616 					  &v_reduc_type);
3617   if (def)
3618     {
3619       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3620       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3621       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3622       reduc_def_info = vinfo_for_stmt (def);
3623       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3624       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3625     }
3626   return def;
3627 }
3628 
3629 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
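/* A worked example, illustrative only: with a known trip count of 100, an
   assumed VF of 8 and PEEL_ITERS_PROLOGUE of 3, the epilogue peel count
   computed below is (100 - 3) % 8 = 1; the scalar cost vector is then
   charged 3 times into PROLOGUE_COST_VEC and once into EPILOGUE_COST_VEC.  */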
3630 int
3631 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3632                              int *peel_iters_epilogue,
3633                              stmt_vector_for_cost *scalar_cost_vec,
3634 			     stmt_vector_for_cost *prologue_cost_vec,
3635 			     stmt_vector_for_cost *epilogue_cost_vec)
3636 {
3637   int retval = 0;
3638   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3639 
3640   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3641     {
3642       *peel_iters_epilogue = assumed_vf / 2;
3643       if (dump_enabled_p ())
3644         dump_printf_loc (MSG_NOTE, vect_location,
3645 			 "cost model: epilogue peel iters set to vf/2 "
3646 			 "because loop iterations are unknown.\n");
3647 
3648       /* If peeled iterations are known but the number of scalar loop
3649          iterations is unknown, count a taken branch per peeled loop.  */
3650       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3651 				 NULL, 0, vect_prologue);
3652       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653 				 NULL, 0, vect_epilogue);
3654     }
3655   else
3656     {
3657       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3658       peel_iters_prologue = niters < peel_iters_prologue ?
3659                             niters : peel_iters_prologue;
3660       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3661       /* If we need to peel for gaps but no epilogue peeling is otherwise
3662 	 required, we have to peel VF iterations.  */
3663       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3664 	*peel_iters_epilogue = assumed_vf;
3665     }
3666 
3667   stmt_info_for_cost *si;
3668   int j;
3669   if (peel_iters_prologue)
3670     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3671 	{
3672 	  stmt_vec_info stmt_info
3673 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3674 	  retval += record_stmt_cost (prologue_cost_vec,
3675 				      si->count * peel_iters_prologue,
3676 				      si->kind, stmt_info, si->misalign,
3677 				      vect_prologue);
3678 	}
3679   if (*peel_iters_epilogue)
3680     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3681 	{
3682 	  stmt_vec_info stmt_info
3683 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3684 	  retval += record_stmt_cost (epilogue_cost_vec,
3685 				      si->count * *peel_iters_epilogue,
3686 				      si->kind, stmt_info, si->misalign,
3687 				      vect_epilogue);
3688 	}
3689 
3690   return retval;
3691 }
3692 
3693 /* Function vect_estimate_min_profitable_iters
3694 
3695    Return the number of iterations required for the vector version of the
3696    loop to be profitable relative to the cost of the scalar version of the
3697    loop.
3698 
3699    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3700    of iterations for vectorization.  A value of -1 means loop vectorization
3701    is not profitable.  This returned value may be used for a dynamic
3702    profitability check.
3703 
3704    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3705    for static check against estimated number of iterations.  */
3706 
3707 static void
3708 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3709 				    int *ret_min_profitable_niters,
3710 				    int *ret_min_profitable_estimate)
3711 {
3712   int min_profitable_iters;
3713   int min_profitable_estimate;
3714   int peel_iters_prologue;
3715   int peel_iters_epilogue;
3716   unsigned vec_inside_cost = 0;
3717   int vec_outside_cost = 0;
3718   unsigned vec_prologue_cost = 0;
3719   unsigned vec_epilogue_cost = 0;
3720   int scalar_single_iter_cost = 0;
3721   int scalar_outside_cost = 0;
3722   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3723   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3724   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3725 
3726   /* Cost model disabled.  */
3727   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3728     {
3729       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3730       *ret_min_profitable_niters = 0;
3731       *ret_min_profitable_estimate = 0;
3732       return;
3733     }
3734 
3735   /* Requires loop versioning tests to handle misalignment.  */
3736   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3737     {
3738       /*  FIXME: Make cost depend on complexity of individual check.  */
3739       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3740       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3741 			    vect_prologue);
3742       dump_printf (MSG_NOTE,
3743                    "cost model: Adding cost of checks for loop "
3744                    "versioning to treat misalignment.\n");
3745     }
3746 
3747   /* Requires loop versioning with alias checks.  */
3748   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3749     {
3750       /*  FIXME: Make cost depend on complexity of individual check.  */
3751       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3752       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3753 			    vect_prologue);
3754       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3755       if (len)
3756 	/* Count LEN - 1 ANDs and LEN comparisons.  */
3757 	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3758 			      NULL, 0, vect_prologue);
3759       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3760       if (len)
3761 	{
3762 	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3763 	  unsigned int nstmts = len * 2 - 1;
3764 	  /* +1 for each bias that needs adding.  */
3765 	  for (unsigned int i = 0; i < len; ++i)
3766 	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3767 	      nstmts += 1;
3768 	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3769 				NULL, 0, vect_prologue);
3770 	}
3771       dump_printf (MSG_NOTE,
3772                    "cost model: Adding cost of checks for loop "
3773                    "versioning aliasing.\n");
3774     }
3775 
3776   /* Requires loop versioning with niter checks.  */
3777   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3778     {
3779       /*  FIXME: Make cost depend on complexity of individual check.  */
3780       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3781 			    vect_prologue);
3782       dump_printf (MSG_NOTE,
3783 		   "cost model: Adding cost of checks for loop "
3784 		   "versioning niters.\n");
3785     }
3786 
3787   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3788     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3789 			  vect_prologue);
3790 
3791   /* Count statements in the scalar loop.  We use this as the scalar cost
3792      of a single iteration for now.
3793 
3794      TODO: Add outer loop support.
3795 
3796      TODO: Consider assigning different costs to different scalar
3797      statements.  */
3798 
3799   scalar_single_iter_cost
3800     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3801 
3802   /* Add additional cost for the peeled instructions in prologue and epilogue
3803      loop.  (For fully-masked loops there will be no peeling.)
3804 
3805      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3806      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3807 
3808      TODO: Build an expression that represents peel_iters for prologue and
3809      epilogue to be used in a run-time test.  */
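  /* E.g. (illustrative only) with an assumed VF of 8 and an unknown
     alignment-peeling amount, both the prologue and the epilogue are costed
     below as if they executed 4 scalar iterations each.  */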
3810 
3811   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3812     {
3813       peel_iters_prologue = 0;
3814       peel_iters_epilogue = 0;
3815 
3816       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3817 	{
3818 	  /* We need to peel exactly one iteration.  */
3819 	  peel_iters_epilogue += 1;
3820 	  stmt_info_for_cost *si;
3821 	  int j;
3822 	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3823 			    j, si)
3824 	    {
3825 	      struct _stmt_vec_info *stmt_info
3826 		= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3827 	      (void) add_stmt_cost (target_cost_data, si->count,
3828 				    si->kind, stmt_info, si->misalign,
3829 				    vect_epilogue);
3830 	    }
3831 	}
3832     }
3833   else if (npeel < 0)
3834     {
3835       peel_iters_prologue = assumed_vf / 2;
3836       dump_printf (MSG_NOTE, "cost model: "
3837                    "prologue peel iters set to vf/2.\n");
3838 
3839       /* If peeling for alignment is unknown, the loop bound of the main
3840          loop becomes unknown.  */
3841       peel_iters_epilogue = assumed_vf / 2;
3842       dump_printf (MSG_NOTE, "cost model: "
3843                    "epilogue peel iters set to vf/2 because "
3844                    "peeling for alignment is unknown.\n");
3845 
3846       /* If peeled iterations are unknown, count a taken branch and a not taken
3847          branch per peeled loop. Even if scalar loop iterations are known,
3848          vector iterations are not known since peeled prologue iterations are
3849          not known. Hence guards remain the same.  */
3850       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3851 			    NULL, 0, vect_prologue);
3852       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3853 			    NULL, 0, vect_prologue);
3854       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3855 			    NULL, 0, vect_epilogue);
3856       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3857 			    NULL, 0, vect_epilogue);
3858       stmt_info_for_cost *si;
3859       int j;
3860       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3861 	{
3862 	  struct _stmt_vec_info *stmt_info
3863 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3864 	  (void) add_stmt_cost (target_cost_data,
3865 				si->count * peel_iters_prologue,
3866 				si->kind, stmt_info, si->misalign,
3867 				vect_prologue);
3868 	  (void) add_stmt_cost (target_cost_data,
3869 				si->count * peel_iters_epilogue,
3870 				si->kind, stmt_info, si->misalign,
3871 				vect_epilogue);
3872 	}
3873     }
3874   else
3875     {
3876       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3877       stmt_info_for_cost *si;
3878       int j;
3879       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3880 
3881       prologue_cost_vec.create (2);
3882       epilogue_cost_vec.create (2);
3883       peel_iters_prologue = npeel;
3884 
3885       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3886 					  &peel_iters_epilogue,
3887 					  &LOOP_VINFO_SCALAR_ITERATION_COST
3888 					    (loop_vinfo),
3889 					  &prologue_cost_vec,
3890 					  &epilogue_cost_vec);
3891 
3892       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3893 	{
3894 	  struct _stmt_vec_info *stmt_info
3895 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3896 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3897 				si->misalign, vect_prologue);
3898 	}
3899 
3900       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3901 	{
3902 	  struct _stmt_vec_info *stmt_info
3903 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3904 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3905 				si->misalign, vect_epilogue);
3906 	}
3907 
3908       prologue_cost_vec.release ();
3909       epilogue_cost_vec.release ();
3910     }
3911 
3912   /* FORNOW: The scalar outside cost is incremented in one of the
3913      following ways:
3914 
3915      1. The vectorizer checks for alignment and aliasing and generates
3916      a condition that allows dynamic vectorization.  A cost model
3917      check is ANDed with the versioning condition.  Hence the scalar code
3918      path now has the added cost of the versioning check.
3919 
3920        if (cost > th & versioning_check)
3921          jmp to vector code
3922 
3923      Hence the run-time scalar cost is incremented by a not-taken branch cost.
3924 
3925      2. The vectorizer then checks if a prologue is required.  If the
3926      cost model check was not done before during versioning, it has to
3927      be done before the prologue check.
3928 
3929        if (cost <= th)
3930          prologue = scalar_iters
3931        if (prologue == 0)
3932          jmp to vector code
3933        else
3934          execute prologue
3935        if (prologue == num_iters)
3936 	 go to exit
3937 
3938      Hence the run-time scalar cost is incremented by a taken branch,
3939      plus a not-taken branch, plus a taken branch cost.
3940 
3941      3. The vectorizer then checks if an epilogue is required.  If the
3942      cost model check was not done before during prologue check, it
3943      has to be done with the epilogue check.
3944 
3945        if (prologue == 0)
3946          jmp to vector code
3947        else
3948          execute prologue
3949        if (prologue == num_iters)
3950 	 go to exit
3951        vector code:
3952          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3953            jmp to epilogue
3954 
3955      Hence the run-time scalar cost should be incremented by 2 taken
3956      branches.
3957 
3958      TODO: The back end may reorder the BBs differently and reverse
3959      conditions/branch directions.  Change the estimates below to
3960      something more reasonable.  */
3961 
3962   /* If the number of iterations is known and we do not do versioning, we can
3963      decide whether to vectorize at compile time.  Hence the scalar version
3964      does not carry cost model guard costs.  */
3965   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3966       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3967     {
3968       /* Cost model check occurs at versioning.  */
3969       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3970 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3971       else
3972 	{
3973 	  /* Cost model check occurs at prologue generation.  */
3974 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3975 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3976 	      + vect_get_stmt_cost (cond_branch_not_taken);
3977 	  /* Cost model check occurs at epilogue generation.  */
3978 	  else
3979 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3980 	}
3981     }
3982 
3983   /* Complete the target-specific cost calculations.  */
3984   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3985 	       &vec_inside_cost, &vec_epilogue_cost);
3986 
3987   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3988 
3989   if (dump_enabled_p ())
3990     {
3991       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3992       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3993                    vec_inside_cost);
3994       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3995                    vec_prologue_cost);
3996       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3997                    vec_epilogue_cost);
3998       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3999                    scalar_single_iter_cost);
4000       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4001                    scalar_outside_cost);
4002       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4003                    vec_outside_cost);
4004       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4005                    peel_iters_prologue);
4006       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4007                    peel_iters_epilogue);
4008     }
4009 
4010   /* Calculate number of iterations required to make the vector version
4011      profitable, relative to the loop bodies only.  The following condition
4012      must hold true:
4013      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4014      where
4015      SIC = scalar iteration cost, VIC = vector iteration cost,
4016      VOC = vector outside cost, VF = vectorization factor,
4017      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4018      SOC = scalar outside cost for run time cost model check.  */
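  /* A worked example with made-up numbers, illustrative only: SIC = 4,
     VIC = 10, VOC = 30, SOC = 6, VF = 4 and no peeling gives
     ((30 - 6) * 4) / (4 * 4 - 10) = 96 / 6 = 16 iterations; and because
     4 * 4 * 16 == 10 * 16 + (30 - 6) * 4, the code below bumps the result
     to 17 so that the vector version is strictly cheaper.  */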
4019 
4020   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4021     {
4022       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4023 			      * assumed_vf
4024 			      - vec_inside_cost * peel_iters_prologue
4025 			      - vec_inside_cost * peel_iters_epilogue);
4026       if (min_profitable_iters <= 0)
4027         min_profitable_iters = 0;
4028       else
4029 	{
4030 	  min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4031 				   - vec_inside_cost);
4032 
4033 	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4034 	      <= (((int) vec_inside_cost * min_profitable_iters)
4035 		  + (((int) vec_outside_cost - scalar_outside_cost)
4036 		     * assumed_vf)))
4037 	    min_profitable_iters++;
4038 	}
4039     }
4040   /* The vector version will never be profitable.  */
4041   else
4042     {
4043       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4044 	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4045 		    "did not happen for a simd loop");
4046 
4047       if (dump_enabled_p ())
4048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049 			 "cost model: the vector iteration cost = %d "
4050 			 "divided by the scalar iteration cost = %d "
4051 			 "is greater or equal to the vectorization factor = %d"
4052                          ".\n",
4053 			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4054       *ret_min_profitable_niters = -1;
4055       *ret_min_profitable_estimate = -1;
4056       return;
4057     }
4058 
4059   dump_printf (MSG_NOTE,
4060 	       "  Calculated minimum iters for profitability: %d\n",
4061 	       min_profitable_iters);
4062 
4063   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4064       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4065     /* We want the vectorized loop to execute at least once.  */
4066     min_profitable_iters = assumed_vf + peel_iters_prologue;
4067 
4068   if (dump_enabled_p ())
4069     dump_printf_loc (MSG_NOTE, vect_location,
4070                      "  Runtime profitability threshold = %d\n",
4071                      min_profitable_iters);
4072 
4073   *ret_min_profitable_niters = min_profitable_iters;
4074 
4075   /* Calculate number of iterations required to make the vector version
4076      profitable, relative to the loop bodies only.
4077 
4078      The cost of the non-vectorized variant is SIC * niters; the vector
4079      variant must win against it at the expected loop trip count, i.e.:
4080      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
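  /* Continuing the made-up example above (illustrative only), SOC now counts
     against the vector version as well:
     ((30 + 6) * 4) / (4 * 4 - 10) = 144 / 6 = 24, which is then raised to at
     least the runtime threshold computed earlier.  */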
4081 
4082   if (vec_outside_cost <= 0)
4083     min_profitable_estimate = 0;
4084   else
4085     {
4086       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4087 				 * assumed_vf
4088 				 - vec_inside_cost * peel_iters_prologue
4089 				 - vec_inside_cost * peel_iters_epilogue)
4090 				 / ((scalar_single_iter_cost * assumed_vf)
4091 				   - vec_inside_cost);
4092     }
4093   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4094   if (dump_enabled_p ())
4095     dump_printf_loc (MSG_NOTE, vect_location,
4096 		     "  Static estimate profitability threshold = %d\n",
4097 		     min_profitable_estimate);
4098 
4099   *ret_min_profitable_estimate = min_profitable_estimate;
4100 }
4101 
4102 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4103    vector elements (not bits) for a vector with NELT elements.  */
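/* For instance (illustrative only), OFFSET = 2 with NELT = 8 builds the
   stepped encoding {2, 3, 4, ...}, i.e. the full selector
   {2, 3, 4, 5, 6, 7, 8, 9}, where indices 8 and 9 refer to the second
   input vector of the permutation.  */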
4104 static void
4105 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4106 			      vec_perm_builder *sel)
4107 {
4108   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4109      by vec_perm_indices.  */
4110   sel->new_vector (nelt, 1, 3);
4111   for (unsigned int i = 0; i < 3; i++)
4112     sel->quick_push (i + offset);
4113 }
4114 
4115 /* Checks whether the target supports whole-vector shifts for vectors of mode
4116    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4117    it supports vec_perm_const with masks for all necessary shift amounts.  */
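/* A sketch of what the loop below checks (illustrative only): for an
   8-element vector it asks whether element shifts by 4, 2 and 1 are all
   supported as constant permutations, since those are the shift amounts a
   log2-style reduction epilogue would emit.  */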
4118 static bool
4119 have_whole_vector_shift (machine_mode mode)
4120 {
4121   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4122     return true;
4123 
4124   /* Variable-length vectors should be handled via the optab.  */
4125   unsigned int nelt;
4126   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4127     return false;
4128 
4129   vec_perm_builder sel;
4130   vec_perm_indices indices;
4131   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4132     {
4133       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4134       indices.new_vector (sel, 2, nelt);
4135       if (!can_vec_perm_const_p (mode, indices, false))
4136 	return false;
4137     }
4138   return true;
4139 }
4140 
4141 /* TODO: There is a close dependency between the vect_model_*_cost and
4142    vectorizable_* functions.  Design this better to avoid maintenance issues.  */
4143 
4144 /* Function vect_model_reduction_cost.
4145 
4146    Models cost for a reduction operation, including the vector ops
4147    generated within the strip-mine loop, the initial definition before
4148    the loop, and the epilogue code that must be generated.  */
4149 
4150 static void
4151 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4152 			   int ncopies)
4153 {
4154   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4155   enum tree_code code;
4156   optab optab;
4157   tree vectype;
4158   gimple *orig_stmt;
4159   machine_mode mode;
4160   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4161   struct loop *loop = NULL;
4162   void *target_cost_data;
4163 
4164   if (loop_vinfo)
4165     {
4166       loop = LOOP_VINFO_LOOP (loop_vinfo);
4167       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4168     }
4169   else
4170     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4171 
4172   /* Condition reductions generate two reductions in the loop.  */
4173   vect_reduction_type reduction_type
4174     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4175   if (reduction_type == COND_REDUCTION)
4176     ncopies *= 2;
4177 
4178   vectype = STMT_VINFO_VECTYPE (stmt_info);
4179   mode = TYPE_MODE (vectype);
4180   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4181 
4182   if (!orig_stmt)
4183     orig_stmt = STMT_VINFO_STMT (stmt_info);
4184 
4185   code = gimple_assign_rhs_code (orig_stmt);
4186 
4187   if (reduction_type == EXTRACT_LAST_REDUCTION
4188       || reduction_type == FOLD_LEFT_REDUCTION)
4189     {
4190       /* No extra instructions needed in the prologue.  */
4191       prologue_cost = 0;
4192 
4193       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4194 	/* Count one reduction-like operation per vector.  */
4195 	inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4196 				     stmt_info, 0, vect_body);
4197       else
4198 	{
4199 	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4200 	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4201 	  inside_cost = add_stmt_cost (target_cost_data,  nelements,
4202 				       vec_to_scalar, stmt_info, 0,
4203 				       vect_body);
4204 	  inside_cost += add_stmt_cost (target_cost_data,  nelements,
4205 					scalar_stmt, stmt_info, 0,
4206 					vect_body);
4207 	}
4208     }
4209   else
4210     {
4211       /* Add in cost for initial definition.
4212 	 For cond reduction we have four vectors: initial index, step,
4213 	 initial result of the data reduction, initial value of the index
4214 	 reduction.  */
4215       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4216       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4217 				      scalar_to_vec, stmt_info, 0,
4218 				      vect_prologue);
4219 
4220       /* Cost of reduction op inside loop.  */
4221       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4222 				   stmt_info, 0, vect_body);
4223     }
4224 
4225   /* Determine cost of epilogue code.
4226 
4227      We have a reduction operator that will reduce the vector in one statement.
4228      Also requires scalar extract.  */
4229 
4230   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4231     {
4232       if (reduc_fn != IFN_LAST)
4233 	{
4234 	  if (reduction_type == COND_REDUCTION)
4235 	    {
4236 	      /* An EQ stmt and a COND_EXPR stmt.  */
4237 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4238 					      vector_stmt, stmt_info, 0,
4239 					      vect_epilogue);
4240 	      /* Reduction of the max index and a reduction of the found
4241 		 values.  */
4242 	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
4243 					      vec_to_scalar, stmt_info, 0,
4244 					      vect_epilogue);
4245 	      /* A broadcast of the max value.  */
4246 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4247 					      scalar_to_vec, stmt_info, 0,
4248 					      vect_epilogue);
4249 	    }
4250 	  else
4251 	    {
4252 	      epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4253 					      stmt_info, 0, vect_epilogue);
4254 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4255 					      vec_to_scalar, stmt_info, 0,
4256 					      vect_epilogue);
4257 	    }
4258 	}
4259       else if (reduction_type == COND_REDUCTION)
4260 	{
4261 	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4262 	  /* Extraction of scalar elements.  */
4263 	  epilogue_cost += add_stmt_cost (target_cost_data,
4264 					  2 * estimated_nunits,
4265 					  vec_to_scalar, stmt_info, 0,
4266 					  vect_epilogue);
4267 	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4268 	  epilogue_cost += add_stmt_cost (target_cost_data,
4269 					  2 * estimated_nunits - 3,
4270 					  scalar_stmt, stmt_info, 0,
4271 					  vect_epilogue);
4272 	}
4273       else if (reduction_type == EXTRACT_LAST_REDUCTION
4274 	       || reduction_type == FOLD_LEFT_REDUCTION)
4275 	/* No extra instructions are needed in the epilogue.  */
4276 	;
4277       else
4278 	{
4279 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4280 	  tree bitsize =
4281 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4282 	  int element_bitsize = tree_to_uhwi (bitsize);
4283 	  int nelements = vec_size_in_bits / element_bitsize;
4284 
4285 	  if (code == COND_EXPR)
4286 	    code = MAX_EXPR;
4287 
4288 	  optab = optab_for_tree_code (code, vectype, optab_default);
4289 
4290 	  /* We have a whole vector shift available.  */
4291 	  if (optab != unknown_optab
4292 	      && VECTOR_MODE_P (mode)
4293 	      && optab_handler (optab, mode) != CODE_FOR_nothing
4294 	      && have_whole_vector_shift (mode))
4295 	    {
4296 	      /* Final reduction via vector shifts and the reduction operator.
4297 		 Also requires scalar extract.  */
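	      /* E.g. (illustrative only) for an 8-element vector this is
		 exact_log2 (8) * 2 = 6 vector statements (3 shifts plus 3
		 reduction ops), followed by one extract of element 0.  */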
4298 	      epilogue_cost += add_stmt_cost (target_cost_data,
4299 					      exact_log2 (nelements) * 2,
4300 					      vector_stmt, stmt_info, 0,
4301 					      vect_epilogue);
4302 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
4303 					      vec_to_scalar, stmt_info, 0,
4304 					      vect_epilogue);
4305 	    }
4306 	  else
4307 	    /* Use extracts and reduction op for final reduction.  For N
4308 	       elements, we have N extracts and N-1 reduction ops.  */
4309 	    epilogue_cost += add_stmt_cost (target_cost_data,
4310 					    nelements + nelements - 1,
4311 					    vector_stmt, stmt_info, 0,
4312 					    vect_epilogue);
4313 	}
4314     }
4315 
4316   if (dump_enabled_p ())
4317     dump_printf (MSG_NOTE,
4318                  "vect_model_reduction_cost: inside_cost = %d, "
4319                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4320                  prologue_cost, epilogue_cost);
4321 }
4322 
4323 
4324 /* Function vect_model_induction_cost.
4325 
4326    Models cost for induction operations.  */
4327 
4328 static void
4329 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4330 {
4331   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4332   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4333   unsigned inside_cost, prologue_cost;
4334 
4335   if (PURE_SLP_STMT (stmt_info))
4336     return;
4337 
4338   /* loop cost for vec_loop.  */
4339   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4340 			       stmt_info, 0, vect_body);
4341 
4342   /* prologue cost for vec_init and vec_step.  */
4343   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4344 				 stmt_info, 0, vect_prologue);
4345 
4346   if (dump_enabled_p ())
4347     dump_printf_loc (MSG_NOTE, vect_location,
4348                      "vect_model_induction_cost: inside_cost = %d, "
4349                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4350 }
4351 
4352 
4353 
4354 /* Function get_initial_def_for_reduction
4355 
4356    Input:
4357    STMT - a stmt that performs a reduction operation in the loop.
4358    INIT_VAL - the initial value of the reduction variable
4359 
4360    Output:
4361    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4362         of the reduction (used for adjusting the epilog - see below).
4363    Return a vector variable, initialized according to the operation that STMT
4364         performs. This vector will be used as the initial value of the
4365         vector of partial results.
4366 
4367    Option1 (adjust in epilog): Initialize the vector as follows:
4368      add/bit or/xor:    [0,0,...,0,0]
4369      mult/bit and:      [1,1,...,1,1]
4370      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4371    and when necessary (e.g. add/mult case) let the caller know
4372    that it needs to adjust the result by init_val.
4373 
4374    Option2: Initialize the vector as follows:
4375      add/bit or/xor:    [init_val,0,0,...,0]
4376      mult/bit and:      [init_val,1,1,...,1]
4377      min/max/cond_expr: [init_val,init_val,...,init_val]
4378    and no adjustments are needed.
4379 
4380    For example, for the following code:
4381 
4382    s = init_val;
4383    for (i=0;i<n;i++)
4384      s = s + a[i];
4385 
4386    STMT is 's = s + a[i]', and the reduction variable is 's'.
4387    For a vector of 4 units, we want to return either [0,0,0,init_val],
4388    or [0,0,0,0] and let the caller know that it needs to adjust
4389    the result at the end by 'init_val'.
4390 
4391    FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4392    is not NULL, because that way the initialization vector is simpler (the
4393    same element in all entries), and Option2 otherwise.
4394 
4395    A cost model should help decide between these two schemes.  */
4396 
4397 tree
4398 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4399                                tree *adjustment_def)
4400 {
4401   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4402   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4403   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4404   tree scalar_type = TREE_TYPE (init_val);
4405   tree vectype = get_vectype_for_scalar_type (scalar_type);
4406   enum tree_code code = gimple_assign_rhs_code (stmt);
4407   tree def_for_init;
4408   tree init_def;
4409   bool nested_in_vect_loop = false;
4410   REAL_VALUE_TYPE real_init_val = dconst0;
4411   int int_init_val = 0;
4412   gimple *def_stmt = NULL;
4413   gimple_seq stmts = NULL;
4414 
4415   gcc_assert (vectype);
4416 
4417   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4418 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4419 
4420   if (nested_in_vect_loop_p (loop, stmt))
4421     nested_in_vect_loop = true;
4422   else
4423     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4424 
4425   /* In case of double reduction we only create a vector variable to be put
4426      in the reduction phi node.  The actual statement creation is done in
4427      vect_create_epilog_for_reduction.  */
4428   if (adjustment_def && nested_in_vect_loop
4429       && TREE_CODE (init_val) == SSA_NAME
4430       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4431       && gimple_code (def_stmt) == GIMPLE_PHI
4432       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4433       && vinfo_for_stmt (def_stmt)
4434       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4435           == vect_double_reduction_def)
4436     {
4437       *adjustment_def = NULL;
4438       return vect_create_destination_var (init_val, vectype);
4439     }
4440 
4441   vect_reduction_type reduction_type
4442     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4443 
4444   /* In case of a nested reduction do not use an adjustment def, as
4445      that case is not handled correctly by the epilogue generation
4446      when ncopies is not one.  */
4447   if (adjustment_def && nested_in_vect_loop)
4448     {
4449       *adjustment_def = NULL;
4450       return vect_get_vec_def_for_operand (init_val, stmt);
4451     }
4452 
4453   switch (code)
4454     {
4455     case WIDEN_SUM_EXPR:
4456     case DOT_PROD_EXPR:
4457     case SAD_EXPR:
4458     case PLUS_EXPR:
4459     case MINUS_EXPR:
4460     case BIT_IOR_EXPR:
4461     case BIT_XOR_EXPR:
4462     case MULT_EXPR:
4463     case BIT_AND_EXPR:
4464       {
4465         /* ADJUSTMENT_DEF is NULL when called from
4466            vect_create_epilog_for_reduction to vectorize double reduction.  */
4467         if (adjustment_def)
4468 	  *adjustment_def = init_val;
4469 
4470         if (code == MULT_EXPR)
4471           {
4472             real_init_val = dconst1;
4473             int_init_val = 1;
4474           }
4475 
4476         if (code == BIT_AND_EXPR)
4477           int_init_val = -1;
4478 
4479         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4480           def_for_init = build_real (scalar_type, real_init_val);
4481         else
4482           def_for_init = build_int_cst (scalar_type, int_init_val);
4483 
4484 	if (adjustment_def)
4485 	  /* Option1: the first element is '0' or '1' as well.  */
4486 	  init_def = gimple_build_vector_from_val (&stmts, vectype,
4487 						   def_for_init);
4488 	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4489 	  {
4490 	    /* Option2 (variable length): the first element is INIT_VAL.  */
4491 	    init_def = build_vector_from_val (vectype, def_for_init);
4492 	    gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4493 						      2, init_def, init_val);
4494 	    init_def = make_ssa_name (vectype);
4495 	    gimple_call_set_lhs (call, init_def);
4496 	    gimple_seq_add_stmt (&stmts, call);
4497 	  }
4498 	else
4499 	  {
4500 	    /* Option2: the first element is INIT_VAL.  */
4501 	    tree_vector_builder elts (vectype, 1, 2);
4502 	    elts.quick_push (init_val);
4503 	    elts.quick_push (def_for_init);
4504 	    init_def = gimple_build_vector (&stmts, &elts);
4505 	  }
4506       }
4507       break;
4508 
4509     case MIN_EXPR:
4510     case MAX_EXPR:
4511     case COND_EXPR:
4512       {
4513 	if (adjustment_def)
4514           {
4515 	    *adjustment_def = NULL_TREE;
4516 	    if (reduction_type != COND_REDUCTION
4517 		&& reduction_type != EXTRACT_LAST_REDUCTION)
4518 	      {
4519 		init_def = vect_get_vec_def_for_operand (init_val, stmt);
4520 		break;
4521 	      }
4522 	  }
4523 	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4524 	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4525       }
4526       break;
4527 
4528     default:
4529       gcc_unreachable ();
4530     }
4531 
4532   if (stmts)
4533     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4534   return init_def;
4535 }
4536 
4537 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4538    NUMBER_OF_VECTORS is the number of vector defs to create.
4539    If NEUTRAL_OP is nonnull, introducing extra elements of that
4540    value will not change the result.  */
4541 
4542 static void
4543 get_initial_defs_for_reduction (slp_tree slp_node,
4544 				vec<tree> *vec_oprnds,
4545 				unsigned int number_of_vectors,
4546 				bool reduc_chain, tree neutral_op)
4547 {
4548   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4549   gimple *stmt = stmts[0];
4550   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4551   unsigned HOST_WIDE_INT nunits;
4552   unsigned j, number_of_places_left_in_vector;
4553   tree vector_type;
4554   unsigned int group_size = stmts.length ();
4555   unsigned int i;
4556   struct loop *loop;
4557 
4558   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4559 
4560   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4561 
4562   loop = (gimple_bb (stmt))->loop_father;
4563   gcc_assert (loop);
4564   edge pe = loop_preheader_edge (loop);
4565 
4566   gcc_assert (!reduc_chain || neutral_op);
4567 
4568   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4569      created vectors. It is greater than 1 if unrolling is performed.
4570 
4571      For example, we have two scalar operands, s1 and s2 (e.g., group of
4572      strided accesses of size two), while NUNITS is four (i.e., four scalars
4573      of this type can be packed in a vector).  The output vector will contain
4574      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4575      will be 2).
4576 
4577      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4578      containing the operands.
4579 
4580      For example, NUNITS is four as before, and the group size is 8
4581      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4582      {s5, s6, s7, s8}.  */
4583 
4584   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4585     nunits = group_size;
4586 
4587   number_of_places_left_in_vector = nunits;
4588   bool constant_p = true;
4589   tree_vector_builder elts (vector_type, nunits, 1);
4590   elts.quick_grow (nunits);
4591   gimple_seq ctor_seq = NULL;
4592   for (j = 0; j < nunits * number_of_vectors; ++j)
4593     {
4594       tree op;
4595       i = j % group_size;
4596       stmt_vinfo = vinfo_for_stmt (stmts[i]);
4597 
4598       /* Get the def before the loop.  In a reduction chain we have only
4599 	 one initial value.  Else we have as many initial values as PHIs in the group.  */
4600       if (reduc_chain)
4601 	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4602       else if (((vec_oprnds->length () + 1) * nunits
4603 		- number_of_places_left_in_vector >= group_size)
4604 	       && neutral_op)
4605 	op = neutral_op;
4606       else
4607 	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4608 
4609       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4610       number_of_places_left_in_vector--;
4611       elts[nunits - number_of_places_left_in_vector - 1] = op;
4612       if (!CONSTANT_CLASS_P (op))
4613 	constant_p = false;
4614 
4615       if (number_of_places_left_in_vector == 0)
4616 	{
4617 	  tree init;
4618 	  if (constant_p && !neutral_op
4619 	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4620 	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4621 	    /* Build the vector directly from ELTS.  */
4622 	    init = gimple_build_vector (&ctor_seq, &elts);
4623 	  else if (neutral_op)
4624 	    {
4625 	      /* Build a vector of the neutral value and shift the
4626 		 other elements into place.  */
4627 	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4628 						   neutral_op);
4629 	      int k = nunits;
4630 	      while (k > 0 && elts[k - 1] == neutral_op)
4631 		k -= 1;
4632 	      while (k > 0)
4633 		{
4634 		  k -= 1;
4635 		  gcall *call = gimple_build_call_internal
4636 		      (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4637 		  init = make_ssa_name (vector_type);
4638 		  gimple_call_set_lhs (call, init);
4639 		  gimple_seq_add_stmt (&ctor_seq, call);
4640 		}
4641 	    }
4642 	  else
4643 	    {
4644 	      /* First time round, duplicate ELTS to fill the
4645 		 required number of vectors.  */
4646 	      duplicate_and_interleave (&ctor_seq, vector_type, elts,
4647 					number_of_vectors, *vec_oprnds);
4648 	      break;
4649 	    }
4650 	  vec_oprnds->quick_push (init);
4651 
4652 	  number_of_places_left_in_vector = nunits;
4653 	  elts.new_vector (vector_type, nunits, 1);
4654 	  elts.quick_grow (nunits);
4655 	  constant_p = true;
4656 	}
4657     }
4658   if (ctor_seq != NULL)
4659     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4660 }
4661 
4662 
4663 /* Function vect_create_epilog_for_reduction
4664 
4665    Create code at the loop-epilog to finalize the result of a reduction
4666    computation.
4667 
4668    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4669      reduction statements.
4670    STMT is the scalar reduction stmt that is being vectorized.
4671    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4672      number of elements that we can fit in a vectype (nunits).  In this case
4673      we have to generate more than one vector stmt - i.e - we need to "unroll"
4674      we have to generate more than one vector stmt, i.e., we need to "unroll"
4675      the vector stmt by a factor of VF/nunits.  For more details see the documentation
4676    REDUC_FN is the internal function for the epilog reduction.
4677    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4678      computation.
4679    REDUC_INDEX is the index of the operand in the right hand side of the
4680      statement that is defined by REDUCTION_PHI.
4681    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4682    SLP_NODE is an SLP node containing a group of reduction statements. The
4683      first one in this group is STMT.
4684    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4685      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4686      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4687      any value of the IV in the loop.
4688    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4689    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4690      null if this is not an SLP reduction.
4691 
4692    This function:
4693    1. Creates the reduction def-use cycles: sets the arguments for
4694       REDUCTION_PHIS:
4695       The loop-entry argument is the vectorized initial-value of the reduction.
4696       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4697       sums.
4698    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4699       by calling the function specified by REDUC_FN if available, or by
4700       other means (whole-vector shifts or a scalar loop).
4701       The function also creates a new phi node at the loop exit to preserve
4702       loop-closed form, as illustrated below.
4703 
4704      The flow at the entry to this function:
4705 
4706         loop:
4707           vec_def = phi <null, null>            # REDUCTION_PHI
4708           VECT_DEF = vector_stmt                # vectorized form of STMT
4709           s_loop = scalar_stmt                  # (scalar) STMT
4710         loop_exit:
4711           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4712           use <s_out0>
4713           use <s_out0>
4714 
4715      The above is transformed by this function into:
4716 
4717         loop:
4718           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4719           VECT_DEF = vector_stmt                # vectorized form of STMT
4720           s_loop = scalar_stmt                  # (scalar) STMT
4721         loop_exit:
4722           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4723           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4724           v_out2 = reduce <v_out1>
4725           s_out3 = extract_field <v_out2, 0>
4726           s_out4 = adjust_result <s_out3>
4727           use <s_out4>
4728           use <s_out4>
4729 */
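/* For example, for a scalar sum reduction

        s = init;
        for (i = 0; i < N; i++)
          s += a[i];

   VECT_DEF holds a vector of partial sums; the epilog created here reduces
   it to a single scalar (v_out2/s_out3) and, when the initial value INIT is
   carried in ADJUSTMENT_DEF rather than in the vector initializer, adds it
   back in the adjust_result step (s_out4).  */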
4730 
4731 static void
4732 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4733 				  gimple *reduc_def_stmt,
4734 				  int ncopies, internal_fn reduc_fn,
4735 				  vec<gimple *> reduction_phis,
4736                                   bool double_reduc,
4737 				  slp_tree slp_node,
4738 				  slp_instance slp_node_instance,
4739 				  tree induc_val, enum tree_code induc_code,
4740 				  tree neutral_op)
4741 {
4742   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4743   stmt_vec_info prev_phi_info;
4744   tree vectype;
4745   machine_mode mode;
4746   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4747   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4748   basic_block exit_bb;
4749   tree scalar_dest;
4750   tree scalar_type;
4751   gimple *new_phi = NULL, *phi;
4752   gimple_stmt_iterator exit_gsi;
4753   tree vec_dest;
4754   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4755   gimple *epilog_stmt = NULL;
4756   enum tree_code code = gimple_assign_rhs_code (stmt);
4757   gimple *exit_phi;
4758   tree bitsize;
4759   tree adjustment_def = NULL;
4760   tree vec_initial_def = NULL;
4761   tree expr, def, initial_def = NULL;
4762   tree orig_name, scalar_result;
4763   imm_use_iterator imm_iter, phi_imm_iter;
4764   use_operand_p use_p, phi_use_p;
4765   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4766   bool nested_in_vect_loop = false;
4767   auto_vec<gimple *> new_phis;
4768   auto_vec<gimple *> inner_phis;
4769   enum vect_def_type dt = vect_unknown_def_type;
4770   int j, i;
4771   auto_vec<tree> scalar_results;
4772   unsigned int group_size = 1, k, ratio;
4773   auto_vec<tree> vec_initial_defs;
4774   auto_vec<gimple *> phis;
4775   bool slp_reduc = false;
4776   bool direct_slp_reduc;
4777   tree new_phi_result;
4778   gimple *inner_phi = NULL;
4779   tree induction_index = NULL_TREE;
4780 
4781   if (slp_node)
4782     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4783 
4784   if (nested_in_vect_loop_p (loop, stmt))
4785     {
4786       outer_loop = loop;
4787       loop = loop->inner;
4788       nested_in_vect_loop = true;
4789       gcc_assert (!slp_node);
4790     }
4791 
4792   vectype = STMT_VINFO_VECTYPE (stmt_info);
4793   gcc_assert (vectype);
4794   mode = TYPE_MODE (vectype);
4795 
4796   /* 1. Create the reduction def-use cycle:
4797      Set the arguments of REDUCTION_PHIS, i.e., transform
4798 
4799         loop:
4800           vec_def = phi <null, null>            # REDUCTION_PHI
4801           VECT_DEF = vector_stmt                # vectorized form of STMT
4802           ...
4803 
4804      into:
4805 
4806         loop:
4807           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4808           VECT_DEF = vector_stmt                # vectorized form of STMT
4809           ...
4810 
4811      (in case of SLP, do it for all the phis). */
4812 
4813   /* Get the loop-entry arguments.  */
4814   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4815   if (slp_node)
4816     {
4817       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4818       vec_initial_defs.reserve (vec_num);
4819       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4820 				      &vec_initial_defs, vec_num,
4821 				      GROUP_FIRST_ELEMENT (stmt_info),
4822 				      neutral_op);
4823     }
4824   else
4825     {
4826       /* Get at the scalar def before the loop, that defines the initial value
4827 	 of the reduction variable.  */
4828       gimple *def_stmt;
4829       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4830 					   loop_preheader_edge (loop));
4831       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4832 	 and we can't use zero for induc_val, use initial_def.  Similarly
4833 	 for REDUC_MIN and initial_def larger than the base.  */
4834       if (TREE_CODE (initial_def) == INTEGER_CST
4835 	  && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4836 	      == INTEGER_INDUC_COND_REDUCTION)
4837 	  && !integer_zerop (induc_val)
4838 	  && ((induc_code == MAX_EXPR
4839 	       && tree_int_cst_lt (initial_def, induc_val))
4840 	      || (induc_code == MIN_EXPR
4841 		  && tree_int_cst_lt (induc_val, initial_def))))
4842 	induc_val = initial_def;
4843       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4844       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4845 						       &adjustment_def);
4846       vec_initial_defs.create (1);
4847       vec_initial_defs.quick_push (vec_initial_def);
4848     }
4849 
4850   /* Set phi nodes arguments.  */
4851   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4852     {
4853       tree vec_init_def = vec_initial_defs[i];
4854       tree def = vect_defs[i];
4855       for (j = 0; j < ncopies; j++)
4856         {
4857 	  if (j != 0)
4858 	    {
4859 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4860 	      if (nested_in_vect_loop)
4861 		vec_init_def
4862 		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4863 						    vec_init_def);
4864 	    }
4865 
4866 	  /* Set the loop-entry arg of the reduction-phi.  */
4867 
4868 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4869 	      == INTEGER_INDUC_COND_REDUCTION)
4870 	    {
4871 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4872 		 initial values from interfering with the reduction op.  */
4873 	      gcc_assert (ncopies == 1);
4874 	      gcc_assert (i == 0);
4875 
4876 	      tree vec_init_def_type = TREE_TYPE (vec_init_def);
4877 	      tree induc_val_vec
4878 		= build_vector_from_val (vec_init_def_type, induc_val);
4879 
4880 	      add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4881 			   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4882 	    }
4883 	  else
4884 	    add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4885 			 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4886 
4887           /* Set the loop-latch arg for the reduction-phi.  */
4888           if (j > 0)
4889             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4890 
4891           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4892 		       UNKNOWN_LOCATION);
4893 
4894           if (dump_enabled_p ())
4895             {
4896               dump_printf_loc (MSG_NOTE, vect_location,
4897 			       "transform reduction: created def-use cycle: ");
4898               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4899               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4900             }
4901         }
4902     }
4903 
4904   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4905      which is updated with the current index of the loop for every match of
4906      the original loop's cond_expr (VEC_STMT).  This results in a vector
4907      containing the last time the condition passed for that vector lane.
4908      The first match will be a 1 to allow 0 to be used for non-matching
4909      indexes.  If there are no matches at all then the vector will be all
4910      zeroes.  */
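  /* For example, with four lanes and the condition holding only in scalar
     iterations 2 and 5, the index IV takes the values {1,2,3,4} and then
     {5,6,7,8}, and the per-lane "last match" vector evolves as
     {0,0,0,0} -> {0,0,3,0} -> {0,6,3,0}; the epilog later picks the lane
     with the maximum index (6, i.e. the latest match).  */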
4911   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4912     {
4913       tree indx_before_incr, indx_after_incr;
4914       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4915 
4916       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4917       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4918 
4919       int scalar_precision
4920 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4921       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4922       tree cr_index_vector_type = build_vector_type
4923 	(cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4924 
4925       /* First we create a simple vector induction variable which starts
4926 	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4927 	 vector size (STEP).  */
4928 
4929       /* Create a {1,2,3,...} vector.  */
4930       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4931 
4932       /* Create a vector of the step value.  */
4933       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4934       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4935 
4936       /* Create an induction variable.  */
4937       gimple_stmt_iterator incr_gsi;
4938       bool insert_after;
4939       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4940       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4941 		 insert_after, &indx_before_incr, &indx_after_incr);
4942 
4943       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4944 	 filled with zeros (VEC_ZERO).  */
4945 
4946       /* Create a vector of 0s.  */
4947       tree zero = build_zero_cst (cr_index_scalar_type);
4948       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4949 
4950       /* Create a vector phi node.  */
4951       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4952       new_phi = create_phi_node (new_phi_tree, loop->header);
4953       set_vinfo_for_stmt (new_phi,
4954 			  new_stmt_vec_info (new_phi, loop_vinfo));
4955       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4956 		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4957 
4958       /* Now take the condition from the loop's original cond_expr
4959 	 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4960 	 every match uses values from the induction variable
4961 	 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4962 	 (NEW_PHI_TREE).
4963 	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4964 	 the new cond_expr (INDEX_COND_EXPR).  */
4965 
4966       /* Duplicate the condition from vec_stmt.  */
4967       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4968 
4969       /* Create a conditional, where the condition is taken from vec_stmt
4970 	 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4971 	 and the "else" value is the phi (NEW_PHI_TREE).  */
4972       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4973 				     ccompare, indx_before_incr,
4974 				     new_phi_tree);
4975       induction_index = make_ssa_name (cr_index_vector_type);
4976       gimple *index_condition = gimple_build_assign (induction_index,
4977 						     index_cond_expr);
4978       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4979       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4980 							loop_vinfo);
4981       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4982       set_vinfo_for_stmt (index_condition, index_vec_info);
4983 
4984       /* Update the phi with the vec cond.  */
4985       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4986 		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4987     }
4988 
4989   /* 2. Create epilog code.
4990         The reduction epilog code operates across the elements of the vector
4991         of partial results computed by the vectorized loop.
4992         The reduction epilog code consists of:
4993 
4994         step 1: compute the scalar result in a vector (v_out2)
4995         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4996         step 3: adjust the scalar result (s_out3) if needed.
4997 
4998         Step 1 can be accomplished using one of the following three schemes:
4999           (scheme 1) using reduc_fn, if available.
5000           (scheme 2) using whole-vector shifts, if available.
5001           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5002                      combined.
5003 
5004           The overall epilog code looks like this:
5005 
5006           s_out0 = phi <s_loop>         # original EXIT_PHI
5007           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5008           v_out2 = reduce <v_out1>              # step 1
5009           s_out3 = extract_field <v_out2, 0>    # step 2
5010           s_out4 = adjust_result <s_out3>       # step 3
5011 
5012           (step 3 is optional, and steps 1 and 2 may be combined).
5013           Lastly, the uses of s_out0 are replaced by s_out4.  */
5014 
5015 
5016   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5017          v_out1 = phi <VECT_DEF>
5018          Store them in NEW_PHIS.  */
5019 
5020   exit_bb = single_exit (loop)->dest;
5021   prev_phi_info = NULL;
5022   new_phis.create (vect_defs.length ());
5023   FOR_EACH_VEC_ELT (vect_defs, i, def)
5024     {
5025       for (j = 0; j < ncopies; j++)
5026         {
5027 	  tree new_def = copy_ssa_name (def);
5028           phi = create_phi_node (new_def, exit_bb);
5029           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5030           if (j == 0)
5031             new_phis.quick_push (phi);
5032           else
5033 	    {
5034 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
5035 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5036 	    }
5037 
5038           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5039           prev_phi_info = vinfo_for_stmt (phi);
5040         }
5041     }
5042 
5043   /* The epilogue is created for the outer-loop, i.e., for the loop being
5044      vectorized.  Create exit phis for the outer loop.  */
5045   if (double_reduc)
5046     {
5047       loop = outer_loop;
5048       exit_bb = single_exit (loop)->dest;
5049       inner_phis.create (vect_defs.length ());
5050       FOR_EACH_VEC_ELT (new_phis, i, phi)
5051 	{
5052 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
5053 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
5054 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5055 			   PHI_RESULT (phi));
5056 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5057 							    loop_vinfo));
5058 	  inner_phis.quick_push (phi);
5059 	  new_phis[i] = outer_phi;
5060 	  prev_phi_info = vinfo_for_stmt (outer_phi);
5061           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5062             {
5063 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5064 	      new_result = copy_ssa_name (PHI_RESULT (phi));
5065 	      outer_phi = create_phi_node (new_result, exit_bb);
5066 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5067 			       PHI_RESULT (phi));
5068 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5069 								loop_vinfo));
5070 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5071 	      prev_phi_info = vinfo_for_stmt (outer_phi);
5072 	    }
5073 	}
5074     }
5075 
5076   exit_gsi = gsi_after_labels (exit_bb);
5077 
5078   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2 and 3
5079          (i.e. when reduc_fn is not available) and in the final adjustment
5080 	 code (if needed).  Also get the original scalar reduction variable as
5081          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. it
5082          represents a reduction pattern), the tree-code and scalar-def are
5083          taken from the original stmt that the pattern-stmt (STMT) replaces.
5084          Otherwise (for a regular reduction) the tree-code and scalar-def
5085          are taken from STMT.  */
5086 
5087   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5088   if (!orig_stmt)
5089     {
5090       /* Regular reduction  */
5091       orig_stmt = stmt;
5092     }
5093   else
5094     {
5095       /* Reduction pattern  */
5096       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5097       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5098       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5099     }
5100 
5101   code = gimple_assign_rhs_code (orig_stmt);
5102   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5103      partial results are added and not subtracted.  */
5104   if (code == MINUS_EXPR)
5105     code = PLUS_EXPR;
5106 
5107   scalar_dest = gimple_assign_lhs (orig_stmt);
5108   scalar_type = TREE_TYPE (scalar_dest);
5109   scalar_results.create (group_size);
5110   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5111   bitsize = TYPE_SIZE (scalar_type);
5112 
5113   /* In case this is a reduction in an inner loop while vectorizing an outer
5114      loop, we don't need to extract a single scalar result at the end of the
5115      inner loop (unless it is a double reduction, i.e., the reduction is used
5116      outside the outer loop).  The final vector of partial results will be used
5117      in the vectorized outer loop, or reduced to a scalar result at the end of
5118      the outer loop.  */
5119   if (nested_in_vect_loop && !double_reduc)
5120     goto vect_finalize_reduction;
5121 
5122   /* SLP reduction without reduction chain, e.g.,
5123      # a1 = phi <a2, a0>
5124      # b1 = phi <b2, b0>
5125      a2 = operation (a1)
5126      b2 = operation (b1)  */
5127   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5128 
5129   /* True if we should implement SLP_REDUC using native reduction operations
5130      instead of scalar operations.  */
5131   direct_slp_reduc = (reduc_fn != IFN_LAST
5132 		      && slp_reduc
5133 		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5134 
5135   /* In case of reduction chain, e.g.,
5136      # a1 = phi <a3, a0>
5137      a2 = operation (a1)
5138      a3 = operation (a2),
5139 
5140      we may end up with more than one vector result.  Here we reduce them to
5141      one vector.  */
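  /* E.g. with two partial result vectors v0 and v1 this emits
     tmp = v0 CODE v1 and the rest of the epilog operates on TMP only.  */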
5142   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5143     {
5144       tree first_vect = PHI_RESULT (new_phis[0]);
5145       gassign *new_vec_stmt = NULL;
5146       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5147       for (k = 1; k < new_phis.length (); k++)
5148         {
5149 	  gimple *next_phi = new_phis[k];
5150           tree second_vect = PHI_RESULT (next_phi);
5151           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5152           new_vec_stmt = gimple_build_assign (tem, code,
5153 					      first_vect, second_vect);
5154           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5155 	  first_vect = tem;
5156         }
5157 
5158       new_phi_result = first_vect;
5159       if (new_vec_stmt)
5160         {
5161           new_phis.truncate (0);
5162           new_phis.safe_push (new_vec_stmt);
5163         }
5164     }
5165   /* Likewise if we couldn't use a single def-use cycle.  */
5166   else if (ncopies > 1)
5167     {
5168       gcc_assert (new_phis.length () == 1);
5169       tree first_vect = PHI_RESULT (new_phis[0]);
5170       gassign *new_vec_stmt = NULL;
5171       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5172       gimple *next_phi = new_phis[0];
5173       for (int k = 1; k < ncopies; ++k)
5174 	{
5175 	  next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5176 	  tree second_vect = PHI_RESULT (next_phi);
5177           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5178           new_vec_stmt = gimple_build_assign (tem, code,
5179 					      first_vect, second_vect);
5180           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5181 	  first_vect = tem;
5182 	}
5183       new_phi_result = first_vect;
5184       new_phis.truncate (0);
5185       new_phis.safe_push (new_vec_stmt);
5186     }
5187   else
5188     new_phi_result = PHI_RESULT (new_phis[0]);
5189 
5190   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5191       && reduc_fn != IFN_LAST)
5192     {
5193       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5194 	 various data values where the condition matched and another vector
5195 	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
5196 	 need to extract the last matching index (which will be the index with
5197 	 highest value) and use this to index into the data vector.
5198 	 For the case where there were no matches, the data vector will contain
5199 	 all default values and the index vector will be all zeros.  */
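      /* Continuing the example above, with INDUCTION_INDEX = {0,6,3,0} the
	 maximum index is 6; comparing against {6,6,6,6} gives the mask
	 {false,true,false,false}, which keeps the single matching data value
	 from NEW_PHI_RESULT and zeros the rest, so the final unsigned MAX
	 reduction extracts exactly that value.  */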
5200 
5201       /* Get various versions of the type of the vector of indexes.  */
5202       tree index_vec_type = TREE_TYPE (induction_index);
5203       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5204       tree index_scalar_type = TREE_TYPE (index_vec_type);
5205       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5206 	(index_vec_type);
5207 
5208       /* Get an unsigned integer version of the type of the data vector.  */
5209       int scalar_precision
5210 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5211       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5212       tree vectype_unsigned = build_vector_type
5213 	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5214 
5215       /* First we need to create a vector (ZERO_VEC) of zeros and another
5216 	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5217 	 can create using a MAX reduction and then expanding.
5218 	 In the case where the loop never made any matches, the max index will
5219 	 be zero.  */
5220 
5221       /* Vector of {0, 0, 0,...}.  */
5222       tree zero_vec = make_ssa_name (vectype);
5223       tree zero_vec_rhs = build_zero_cst (vectype);
5224       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5225       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5226 
5227       /* Find maximum value from the vector of found indexes.  */
5228       tree max_index = make_ssa_name (index_scalar_type);
5229       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5230 							  1, induction_index);
5231       gimple_call_set_lhs (max_index_stmt, max_index);
5232       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5233 
5234       /* Vector of {max_index, max_index, max_index,...}.  */
5235       tree max_index_vec = make_ssa_name (index_vec_type);
5236       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5237 						      max_index);
5238       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5239 							max_index_vec_rhs);
5240       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5241 
5242       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5243 	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5244 	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5245 	 otherwise.  Only one value should match, resulting in a vector
5246 	 (VEC_COND) with one data value and the rest zeros.
5247 	 In the case where the loop never made any matches, every index will
5248 	 match, resulting in a vector with all data values (which will all be
5249 	 the default value).  */
5250 
5251       /* Compare the max index vector to the vector of found indexes to find
5252 	 the position of the max value.  */
5253       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5254       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5255 						      induction_index,
5256 						      max_index_vec);
5257       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5258 
5259       /* Use the compare to choose either values from the data vector or
5260 	 zero.  */
5261       tree vec_cond = make_ssa_name (vectype);
5262       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5263 						   vec_compare, new_phi_result,
5264 						   zero_vec);
5265       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5266 
5267       /* Finally we need to extract the data value from the vector (VEC_COND)
5268 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5269 	 reduction, but because this doesn't exist, we can use a MAX reduction
5270 	 instead.  The data value might be signed or a float so we need to cast
5271 	 it first.
5272 	 In the case where the loop never made any matches, the data values are
5273 	 all identical, and so will reduce down correctly.  */
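      /* E.g. an unsigned view {0, 0x42, 0, 0} reduces to 0x42, since at most
	 one lane is non-zero (or all lanes are equal in the no-match case).  */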
5274 
5275       /* Make the matched data values unsigned.  */
5276       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5277       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5278 				       vec_cond);
5279       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5280 							VIEW_CONVERT_EXPR,
5281 							vec_cond_cast_rhs);
5282       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5283 
5284       /* Reduce down to a scalar value.  */
5285       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5286       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5287 							   1, vec_cond_cast);
5288       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5289       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5290 
5291       /* Convert the reduced value back to the result type and set as the
5292 	 result.  */
5293       gimple_seq stmts = NULL;
5294       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5295 			       data_reduc);
5296       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5297       scalar_results.safe_push (new_temp);
5298     }
5299   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5300 	   && reduc_fn == IFN_LAST)
5301     {
5302       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5303 	 idx_val = induction_index[0];
5304 	 val = data_reduc[0];
5305 	 for (i = 1; i < nelts; ++i)
5306 	   if (induction_index[i] > idx_val)
5307 	     val = data_reduc[i], idx_val = induction_index[i];
5308 	 return val;  */
5310 
5311       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5312       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5313       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5314       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5315       /* Enforced by vectorizable_reduction, which ensures we have target
5316 	 support before allowing a conditional reduction on variable-length
5317 	 vectors.  */
5318       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5319       tree idx_val = NULL_TREE, val = NULL_TREE;
5320       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5321 	{
5322 	  tree old_idx_val = idx_val;
5323 	  tree old_val = val;
5324 	  idx_val = make_ssa_name (idx_eltype);
5325 	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5326 					     build3 (BIT_FIELD_REF, idx_eltype,
5327 						     induction_index,
5328 						     bitsize_int (el_size),
5329 						     bitsize_int (off)));
5330 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331 	  val = make_ssa_name (data_eltype);
5332 	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5333 					     build3 (BIT_FIELD_REF,
5334 						     data_eltype,
5335 						     new_phi_result,
5336 						     bitsize_int (el_size),
5337 						     bitsize_int (off)));
5338 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339 	  if (off != 0)
5340 	    {
5341 	      tree new_idx_val = idx_val;
5342 	      tree new_val = val;
5343 	      if (off != v_size - el_size)
5344 		{
5345 		  new_idx_val = make_ssa_name (idx_eltype);
5346 		  epilog_stmt = gimple_build_assign (new_idx_val,
5347 						     MAX_EXPR, idx_val,
5348 						     old_idx_val);
5349 		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350 		}
5351 	      new_val = make_ssa_name (data_eltype);
5352 	      epilog_stmt = gimple_build_assign (new_val,
5353 						 COND_EXPR,
5354 						 build2 (GT_EXPR,
5355 							 boolean_type_node,
5356 							 idx_val,
5357 							 old_idx_val),
5358 						 val, old_val);
5359 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 	      idx_val = new_idx_val;
5361 	      val = new_val;
5362 	    }
5363 	}
5364       /* Convert the reduced value back to the result type and set as the
5365 	 result.  */
5366       gimple_seq stmts = NULL;
5367       val = gimple_convert (&stmts, scalar_type, val);
5368       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5369       scalar_results.safe_push (val);
5370     }
5371 
5372   /* 2.3 Create the reduction code, using one of the three schemes described
5373          above. In SLP we simply need to extract all the elements from the
5374          vector (without reducing them), so we use scalar shifts.  */
5375   else if (reduc_fn != IFN_LAST && !slp_reduc)
5376     {
5377       tree tmp;
5378       tree vec_elem_type;
5379 
5380       /* Case 1:  Create:
5381          v_out2 = reduc_expr <v_out1>  */
5382 
5383       if (dump_enabled_p ())
5384         dump_printf_loc (MSG_NOTE, vect_location,
5385 			 "Reduce using direct vector reduction.\n");
5386 
5387       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5388       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5389 	{
5390 	  tree tmp_dest
5391 	    = vect_create_destination_var (scalar_dest, vec_elem_type);
5392 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5393 						    new_phi_result);
5394 	  gimple_set_lhs (epilog_stmt, tmp_dest);
5395 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5396 	  gimple_set_lhs (epilog_stmt, new_temp);
5397 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 
5399 	  epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5400 					     new_temp);
5401 	}
5402       else
5403 	{
5404 	  epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5405 						    new_phi_result);
5406 	  gimple_set_lhs (epilog_stmt, new_scalar_dest);
5407 	}
5408 
5409       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5410       gimple_set_lhs (epilog_stmt, new_temp);
5411       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5412 
5413       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5414 	   == INTEGER_INDUC_COND_REDUCTION)
5415 	  && !operand_equal_p (initial_def, induc_val, 0))
5416 	{
5417 	  /* Earlier we set the initial value to be a vector of induc_val
5418 	     values.  Check the result and if it is induc_val then replace
5419 	     it with the original initial value, unless induc_val is
5420 	     the same as initial_def already.  */
5421 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5422 				  induc_val);
5423 
5424 	  tmp = make_ssa_name (new_scalar_dest);
5425 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5426 					     initial_def, new_temp);
5427 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5428 	  new_temp = tmp;
5429 	}
5430 
5431       scalar_results.safe_push (new_temp);
5432     }
5433   else if (direct_slp_reduc)
5434     {
5435       /* Here we create one vector for each of the GROUP_SIZE results,
5436 	 with the elements for other SLP statements replaced with the
5437 	 neutral value.  We can then do a normal reduction on each vector.  */
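      /* For example, with GROUP_SIZE == 2 and partial results laid out as
	 {a0, b0, a1, b1, ...}, the masked index vector built below is
	 {0, 1, 0, 1, ...}; for i == 0 we build {a0, n, a1, n, ...} (N being
	 the neutral or initial value) and reduce it to the first scalar
	 result, and likewise the odd lanes for the second.  */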
5438 
5439       /* Enforced by vectorizable_reduction.  */
5440       gcc_assert (new_phis.length () == 1);
5441       gcc_assert (pow2p_hwi (group_size));
5442 
5443       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5444       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5445       gimple_seq seq = NULL;
5446 
5447       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5448 	 and the same element size as VECTYPE.  */
5449       tree index = build_index_vector (vectype, 0, 1);
5450       tree index_type = TREE_TYPE (index);
5451       tree index_elt_type = TREE_TYPE (index_type);
5452       tree mask_type = build_same_sized_truth_vector_type (index_type);
5453 
5454       /* Create a vector that, for each element, identifies which of
5455 	 the GROUP_SIZE results should use it.  */
5456       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5457       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5458 			    build_vector_from_val (index_type, index_mask));
5459 
5460       /* Get a neutral vector value.  This is simply a splat of the neutral
5461 	 scalar value if we have one, otherwise the initial scalar value
5462 	 is itself a neutral value.  */
5463       tree vector_identity = NULL_TREE;
5464       if (neutral_op)
5465 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5466 							neutral_op);
5467       for (unsigned int i = 0; i < group_size; ++i)
5468 	{
5469 	  /* If there's no universal neutral value, we can use the
5470 	     initial scalar value from the original PHI.  This is used
5471 	     for MIN and MAX reduction, for example.  */
5472 	  if (!neutral_op)
5473 	    {
5474 	      tree scalar_value
5475 		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5476 					 loop_preheader_edge (loop));
5477 	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5478 							      scalar_value);
5479 	    }
5480 
5481 	  /* Calculate the equivalent of:
5482 
5483 	     sel[j] = (index[j] == i);
5484 
5485 	     which selects the elements of NEW_PHI_RESULT that should
5486 	     be included in the result.  */
5487 	  tree compare_val = build_int_cst (index_elt_type, i);
5488 	  compare_val = build_vector_from_val (index_type, compare_val);
5489 	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5490 				   index, compare_val);
5491 
5492 	  /* Calculate the equivalent of:
5493 
5494 	     vec = sel ? new_phi_result : vector_identity;
5495 
5496 	     VEC is now suitable for a full vector reduction.  */
5497 	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5498 				   sel, new_phi_result, vector_identity);
5499 
5500 	  /* Do the reduction and convert it to the appropriate type.  */
5501 	  gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5502 	  tree scalar = make_ssa_name (TREE_TYPE (vectype));
5503 	  gimple_call_set_lhs (call, scalar);
5504 	  gimple_seq_add_stmt (&seq, call);
5505 	  scalar = gimple_convert (&seq, scalar_type, scalar);
5506 	  scalar_results.safe_push (scalar);
5507 	}
5508       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5509     }
5510   else
5511     {
5512       bool reduce_with_shift;
5513       tree vec_temp;
5514 
5515       /* COND reductions all do the final reduction with MAX_EXPR
5516 	 or MIN_EXPR.  */
5517       if (code == COND_EXPR)
5518 	{
5519 	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5520 	      == INTEGER_INDUC_COND_REDUCTION)
5521 	    code = induc_code;
5522 	  else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5523 		   == CONST_COND_REDUCTION)
5524 	    code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5525 	  else
5526 	    code = MAX_EXPR;
5527 	}
5528 
5529       /* See if the target wants to do the final (shift) reduction
5530 	 in a vector mode of smaller size and first reduce upper/lower
5531 	 halves against each other.  */
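      /* For example, a 256-bit vector of partial results may first be split
	 into two 128-bit halves that are combined with CODE, so that the
	 final reduction below only has to handle a 128-bit vector.  */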
5532       enum machine_mode mode1 = mode;
5533       tree vectype1 = vectype;
5534       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5535       unsigned sz1 = sz;
5536       if (!slp_reduc
5537 	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5538 	sz1 = GET_MODE_SIZE (mode1).to_constant ();
5539 
5540       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5541       reduce_with_shift = have_whole_vector_shift (mode1);
5542       if (!VECTOR_MODE_P (mode1))
5543 	reduce_with_shift = false;
5544       else
5545 	{
5546 	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5547 	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5548 	    reduce_with_shift = false;
5549 	}
5550 
5551       /* First reduce the vector to the size on which we want to do the final
5552 	 shift reduction, by repeatedly combining upper and lower halves.  */
5553       new_temp = new_phi_result;
5554       while (sz > sz1)
5555 	{
5556 	  gcc_assert (!slp_reduc);
5557 	  sz /= 2;
5558 	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5559 
5560 	  /* The target has to make sure we support lowpart/highpart
5561 	     extraction, either via direct vector extract or through
5562 	     integer mode punning.  */
5563 	  tree dst1, dst2;
5564 	  if (convert_optab_handler (vec_extract_optab,
5565 				     TYPE_MODE (TREE_TYPE (new_temp)),
5566 				     TYPE_MODE (vectype1))
5567 	      != CODE_FOR_nothing)
5568 	    {
5569 	      /* Extract sub-vectors directly once vec_extract becomes
5570 		 a conversion optab.  */
5571 	      dst1 = make_ssa_name (vectype1);
5572 	      epilog_stmt
5573 		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5574 					 build3 (BIT_FIELD_REF, vectype1,
5575 						 new_temp, TYPE_SIZE (vectype1),
5576 						 bitsize_int (0)));
5577 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5578 	      dst2 =  make_ssa_name (vectype1);
5579 	      epilog_stmt
5580 		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5581 					 build3 (BIT_FIELD_REF, vectype1,
5582 						 new_temp, TYPE_SIZE (vectype1),
5583 						 bitsize_int (sz * BITS_PER_UNIT)));
5584 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5585 	    }
5586 	  else
5587 	    {
5588 	      /* Extract via punning to appropriately sized integer mode
5589 		 vector.  */
5590 	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5591 							    1);
5592 	      tree etype = build_vector_type (eltype, 2);
5593 	      gcc_assert (convert_optab_handler (vec_extract_optab,
5594 						 TYPE_MODE (etype),
5595 						 TYPE_MODE (eltype))
5596 			  != CODE_FOR_nothing);
5597 	      tree tem = make_ssa_name (etype);
5598 	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5599 						 build1 (VIEW_CONVERT_EXPR,
5600 							 etype, new_temp));
5601 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5602 	      new_temp = tem;
5603 	      tem = make_ssa_name (eltype);
5604 	      epilog_stmt
5605 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5606 					 build3 (BIT_FIELD_REF, eltype,
5607 						 new_temp, TYPE_SIZE (eltype),
5608 						 bitsize_int (0)));
5609 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5610 	      dst1 = make_ssa_name (vectype1);
5611 	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5612 						 build1 (VIEW_CONVERT_EXPR,
5613 							 vectype1, tem));
5614 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5615 	      tem = make_ssa_name (eltype);
5616 	      epilog_stmt
5617 		  = gimple_build_assign (tem, BIT_FIELD_REF,
5618 					 build3 (BIT_FIELD_REF, eltype,
5619 						 new_temp, TYPE_SIZE (eltype),
5620 						 bitsize_int (sz * BITS_PER_UNIT)));
5621 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5622 	      dst2 =  make_ssa_name (vectype1);
5623 	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5624 						 build1 (VIEW_CONVERT_EXPR,
5625 							 vectype1, tem));
5626 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5627 	    }
5628 
5629 	  new_temp = make_ssa_name (vectype1);
5630 	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5631 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5632 	}
5633 
5634       if (reduce_with_shift && !slp_reduc)
5635 	{
5636 	  int element_bitsize = tree_to_uhwi (bitsize);
5637 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5638 	     for variable-length vectors and also requires direct target support
5639 	     for loop reductions.  */
5640 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5641 	  int nelements = vec_size_in_bits / element_bitsize;
5642 	  vec_perm_builder sel;
5643 	  vec_perm_indices indices;
5644 
5645           int elt_offset;
5646 
5647           tree zero_vec = build_zero_cst (vectype1);
5648           /* Case 2: Create:
5649              for (offset = nelements/2; offset >= 1; offset/=2)
5650                 {
5651                   Create:  va' = vec_shift <va, offset>
5652                   Create:  va = vop <va, va'>
5653                 }  */
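          /* E.g. for a PLUS reduction of {a0, a1, a2, a3} this computes
             {a0+a2, a1+a3, _, _} after the first iteration and
             {a0+a1+a2+a3, _, _, _} after the second; the scalar result is
             then extracted from lane 0 below.  */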
5654 
5655           tree rhs;
5656 
5657           if (dump_enabled_p ())
5658             dump_printf_loc (MSG_NOTE, vect_location,
5659 			     "Reduce using vector shifts\n");
5660 
5661 	  mode1 = TYPE_MODE (vectype1);
5662           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5663           for (elt_offset = nelements / 2;
5664                elt_offset >= 1;
5665                elt_offset /= 2)
5666             {
5667 	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5668 	      indices.new_vector (sel, 2, nelements);
5669 	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5670 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5671 						 new_temp, zero_vec, mask);
5672               new_name = make_ssa_name (vec_dest, epilog_stmt);
5673               gimple_assign_set_lhs (epilog_stmt, new_name);
5674               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5675 
5676 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5677 						 new_temp);
5678               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5679               gimple_assign_set_lhs (epilog_stmt, new_temp);
5680               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681             }
5682 
5683 	  /* 2.4  Extract the final scalar result.  Create:
5684 	     s_out3 = extract_field <v_out2, bitpos>  */
5685 
5686 	  if (dump_enabled_p ())
5687 	    dump_printf_loc (MSG_NOTE, vect_location,
5688 			     "extract scalar result\n");
5689 
5690 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5691 			bitsize, bitsize_zero_node);
5692 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5693 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5694 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5695 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5696 	  scalar_results.safe_push (new_temp);
5697         }
5698       else
5699         {
5700           /* Case 3: Create:
5701              s = extract_field <v_out2, 0>
5702              for (offset = element_size;
5703                   offset < vector_size;
5704                   offset += element_size)
5705                {
5706                  Create:  s' = extract_field <v_out2, offset>
5707                  Create:  s = op <s, s'>  // For non SLP cases
5708                }  */
5709 
5710           if (dump_enabled_p ())
5711             dump_printf_loc (MSG_NOTE, vect_location,
5712 			     "Reduce using scalar code.\n");
5713 
5714 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5715 	  int element_bitsize = tree_to_uhwi (bitsize);
5716           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5717             {
5718               int bit_offset;
5719               if (gimple_code (new_phi) == GIMPLE_PHI)
5720                 vec_temp = PHI_RESULT (new_phi);
5721               else
5722                 vec_temp = gimple_assign_lhs (new_phi);
5723               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5724 				 bitsize_zero_node);
5725               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5726               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5727               gimple_assign_set_lhs (epilog_stmt, new_temp);
5728               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5729 
5730               /* In SLP we don't need to apply the reduction operation, so we
5731                  just collect the s' values in SCALAR_RESULTS.  */
5732               if (slp_reduc)
5733                 scalar_results.safe_push (new_temp);
5734 
5735               for (bit_offset = element_bitsize;
5736                    bit_offset < vec_size_in_bits;
5737                    bit_offset += element_bitsize)
5738                 {
5739                   tree bitpos = bitsize_int (bit_offset);
5740                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5741                                      bitsize, bitpos);
5742 
5743                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5744                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5745                   gimple_assign_set_lhs (epilog_stmt, new_name);
5746                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5747 
5748                   if (slp_reduc)
5749                     {
5750                       /* In SLP we don't need to apply the reduction operation,
5751                          so we just collect the s' values in SCALAR_RESULTS.  */
5752                       new_temp = new_name;
5753                       scalar_results.safe_push (new_name);
5754                     }
5755                   else
5756                     {
5757 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5758 							 new_name, new_temp);
5759                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5760                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5761                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5762                     }
5763                 }
5764             }
5765 
5766           /* The only case where we need to reduce scalar results in SLP is
5767              unrolling.  If the size of SCALAR_RESULTS is greater than
5768              GROUP_SIZE, we reduce them by combining elements modulo
5769              GROUP_SIZE.  */
5770           if (slp_reduc)
5771             {
5772               tree res, first_res, new_res;
5773 	      gimple *new_stmt;
5774 
5775               /* Reduce multiple scalar results in case of SLP unrolling.  */
5776               for (j = group_size; scalar_results.iterate (j, &res);
5777                    j++)
5778                 {
5779                   first_res = scalar_results[j % group_size];
5780 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
5781 						  first_res, res);
5782                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5783                   gimple_assign_set_lhs (new_stmt, new_res);
5784                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5785                   scalar_results[j % group_size] = new_res;
5786                 }
5787             }
5788           else
5789             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5790             scalar_results.safe_push (new_temp);
5791         }
5792 
5793       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5794 	   == INTEGER_INDUC_COND_REDUCTION)
5795 	  && !operand_equal_p (initial_def, induc_val, 0))
5796 	{
5797 	  /* Earlier we set the initial value to be a vector of induc_val
5798 	     values.  Check the result and if it is induc_val then replace
5799 	     it with the original initial value, unless induc_val is
5800 	     the same as initial_def already.  */
5801 	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5802 				  induc_val);
5803 
5804 	  tree tmp = make_ssa_name (new_scalar_dest);
5805 	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5806 					     initial_def, new_temp);
5807 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5808 	  scalar_results[0] = tmp;
5809 	}
5810     }
5811 
5812 vect_finalize_reduction:
5813 
5814   if (double_reduc)
5815     loop = loop->inner;
5816 
5817   /* 2.5 Adjust the final result by the initial value of the reduction
5818 	 variable.  (When such an adjustment is not needed,
5819 	 'adjustment_def' is zero.)  For example, if code is PLUS we create:
5820 	 new_temp = loop_exit_def + adjustment_def  */
5821 
5822   if (adjustment_def)
5823     {
5824       gcc_assert (!slp_reduc);
5825       if (nested_in_vect_loop)
5826 	{
5827           new_phi = new_phis[0];
5828 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5829 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5830 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
5831 	}
5832       else
5833 	{
5834           new_temp = scalar_results[0];
5835 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5836 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
5837 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5838 	}
5839 
5840       epilog_stmt = gimple_build_assign (new_dest, expr);
5841       new_temp = make_ssa_name (new_dest, epilog_stmt);
5842       gimple_assign_set_lhs (epilog_stmt, new_temp);
5843       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5844       if (nested_in_vect_loop)
5845         {
5846           set_vinfo_for_stmt (epilog_stmt,
5847                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5848           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5849                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5850 
5851           if (!double_reduc)
5852             scalar_results.quick_push (new_temp);
5853           else
5854             scalar_results[0] = new_temp;
5855         }
5856       else
5857         scalar_results[0] = new_temp;
5858 
5859       new_phis[0] = epilog_stmt;
5860     }
5861 
5862   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5863           phis with new adjusted scalar results, i.e., replace use <s_out0>
5864           with use <s_out4>.
5865 
5866      Transform:
5867         loop_exit:
5868           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5869           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5870           v_out2 = reduce <v_out1>
5871           s_out3 = extract_field <v_out2, 0>
5872           s_out4 = adjust_result <s_out3>
5873           use <s_out0>
5874           use <s_out0>
5875 
5876      into:
5877 
5878         loop_exit:
5879           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5880           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5881           v_out2 = reduce <v_out1>
5882           s_out3 = extract_field <v_out2, 0>
5883           s_out4 = adjust_result <s_out3>
5884           use <s_out4>
5885           use <s_out4> */
5886 
5887 
5888   /* In an SLP reduction chain we reduce the vector results into one vector if
5889      necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS of
5890      the last stmt in the reduction chain, since we are looking for the loop
5891      exit phi node.  */
5892   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5893     {
5894       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5895       /* Handle reduction patterns.  */
5896       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5897 	dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5898 
5899       scalar_dest = gimple_assign_lhs (dest_stmt);
5900       group_size = 1;
5901     }
5902 
5903   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5904      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
5905      need to match SCALAR_RESULTS with the corresponding statements.  The first
5906      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5907      the first vector stmt, etc.
5908      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5909   if (group_size > new_phis.length ())
5910     {
5911       ratio = group_size / new_phis.length ();
5912       gcc_assert (!(group_size % new_phis.length ()));
5913     }
5914   else
5915     ratio = 1;
5916 
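  /* Purely as an illustration: with GROUP_SIZE == 4 and two new vector
     stmts, RATIO is 2, so in the loop below scalar_results[0] and
     scalar_results[1] are matched with new_phis[0], while scalar_results[2]
     and scalar_results[3] are matched with new_phis[1].  */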
5917   for (k = 0; k < group_size; k++)
5918     {
5919       if (k % ratio == 0)
5920         {
5921           epilog_stmt = new_phis[k / ratio];
5922           reduction_phi = reduction_phis[k / ratio];
5923 	  if (double_reduc)
5924 	    inner_phi = inner_phis[k / ratio];
5925         }
5926 
5927       if (slp_reduc)
5928         {
5929 	  gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5930 
5931           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5932           /* SLP statements can't participate in patterns.  */
5933           gcc_assert (!orig_stmt);
5934           scalar_dest = gimple_assign_lhs (current_stmt);
5935         }
5936 
5937       phis.create (3);
5938       /* Find the loop-closed-use at the loop exit of the original scalar
5939          result.  (The reduction result is expected to have two immediate uses -
5940          one at the latch block, and one at the loop exit).  */
5941       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5942         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5943 	    && !is_gimple_debug (USE_STMT (use_p)))
5944           phis.safe_push (USE_STMT (use_p));
5945 
5946       /* While we expect to have found an exit_phi because of loop-closed-ssa
5947          form, we can end up without one if the scalar cycle is dead.  */
5948 
5949       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5950         {
5951           if (outer_loop)
5952             {
5953               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5954               gphi *vect_phi;
5955 
5956               /* FORNOW. Currently not supporting the case that an inner-loop
5957                  reduction is not used in the outer-loop (but only outside the
5958                  outer-loop), unless it is a double reduction.  */
5959               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5960                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5961                           || double_reduc);
5962 
5963 	      if (double_reduc)
5964 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5965 	      else
5966 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5967               if (!double_reduc
5968                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5969                       != vect_double_reduction_def)
5970                 continue;
5971 
5972               /* Handle double reduction:
5973 
5974                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5975                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5976                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5977                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5978 
5979                  At that point the regular reduction (stmt2 and stmt3) is
5980                  already vectorized, as well as the exit phi node, stmt4.
5981                  Here we vectorize the phi node of double reduction, stmt1, and
5982                  update all relevant statements.  */
5983 
5984               /* Go through all the uses of s2 to find double reduction phi
5985                  node, i.e., stmt1 above.  */
5986               orig_name = PHI_RESULT (exit_phi);
5987               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5988                 {
5989                   stmt_vec_info use_stmt_vinfo;
5990                   stmt_vec_info new_phi_vinfo;
5991                   tree vect_phi_init, preheader_arg, vect_phi_res;
5992                   basic_block bb = gimple_bb (use_stmt);
5993 		  gimple *use;
5994 
5995                   /* Check that USE_STMT is really a double reduction phi
5996                      node.  */
5997                   if (gimple_code (use_stmt) != GIMPLE_PHI
5998                       || gimple_phi_num_args (use_stmt) != 2
5999                       || bb->loop_father != outer_loop)
6000                     continue;
6001                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6002                   if (!use_stmt_vinfo
6003                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6004                           != vect_double_reduction_def)
6005 		    continue;
6006 
6007                   /* Create vector phi node for double reduction:
6008                      vs1 = phi <vs0, vs2>
6009                      vs1 was created previously in this function by a call to
6010                        vect_get_vec_def_for_operand and is stored in
6011                        vec_initial_def;
6012                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6013                      vs0 is created here.  */
6014 
6015                   /* Create vector phi node.  */
6016                   vect_phi = create_phi_node (vec_initial_def, bb);
6017                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6018                                     loop_vec_info_for_loop (outer_loop));
6019                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6020 
6021                   /* Create vs0 - initial def of the double reduction phi.  */
6022                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6023                                              loop_preheader_edge (outer_loop));
6024                   vect_phi_init = get_initial_def_for_reduction
6025 		    (stmt, preheader_arg, NULL);
6026 
6027                   /* Update phi node arguments with vs0 and vs2.  */
6028                   add_phi_arg (vect_phi, vect_phi_init,
6029                                loop_preheader_edge (outer_loop),
6030                                UNKNOWN_LOCATION);
6031                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6032                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6033                   if (dump_enabled_p ())
6034                     {
6035                       dump_printf_loc (MSG_NOTE, vect_location,
6036 				       "created double reduction phi node: ");
6037                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6038                     }
6039 
6040                   vect_phi_res = PHI_RESULT (vect_phi);
6041 
6042                   /* Replace the use, i.e., set the correct vs1 in the regular
6043                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6044                      loop is redundant.  */
6045                   use = reduction_phi;
6046                   for (j = 0; j < ncopies; j++)
6047                     {
6048                       edge pr_edge = loop_preheader_edge (loop);
6049                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6050                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6051                     }
6052                 }
6053             }
6054         }
6055 
6056       phis.release ();
6057       if (nested_in_vect_loop)
6058         {
6059           if (double_reduc)
6060             loop = outer_loop;
6061           else
6062             continue;
6063         }
6064 
6065       phis.create (3);
6066       /* Find the loop-closed-use at the loop exit of the original scalar
6067          result.  (The reduction result is expected to have two immediate uses,
6068          one at the latch block, and one at the loop exit).  For double
6069          reductions we are looking for exit phis of the outer loop.  */
6070       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6071         {
6072           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6073 	    {
6074 	      if (!is_gimple_debug (USE_STMT (use_p)))
6075 		phis.safe_push (USE_STMT (use_p));
6076 	    }
6077           else
6078             {
6079               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6080                 {
6081                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6082 
6083                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6084                     {
6085                       if (!flow_bb_inside_loop_p (loop,
6086                                              gimple_bb (USE_STMT (phi_use_p)))
6087 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
6088                         phis.safe_push (USE_STMT (phi_use_p));
6089                     }
6090                 }
6091             }
6092         }
6093 
6094       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6095         {
6096           /* Replace the uses:  */
6097           orig_name = PHI_RESULT (exit_phi);
6098           scalar_result = scalar_results[k];
6099           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6100             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6101               SET_USE (use_p, scalar_result);
6102         }
6103 
6104       phis.release ();
6105     }
6106 }
6107 
6108 /* Return a vector of type VECTYPE that is equal to the vector select
6109    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6110    before GSI.  */
6111 
6112 static tree
6113 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6114 		     tree vec, tree identity)
6115 {
6116   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6117   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6118 					  mask, vec, identity);
6119   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6120   return cond;
6121 }
6122 
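/* Illustrative sketch (the SSA names below are made up): for a fully-masked
   fold-left reduction the function above emits

     cond_1 = VEC_COND_EXPR <loop_mask_2, vec_def_3, { 0, ... }>;

   so that lanes disabled by the mask are replaced by the identity value
   (zero for a PLUS reduction) and do not affect the accumulated result.  */
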
6123 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6124    order, starting with LHS.  Insert the extraction statements before GSI and
6125    associate the new scalar SSA names with variable SCALAR_DEST.
6126    Return the SSA name for the result.  */
6127 
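/* For example (conceptual GIMPLE only): for a four-element VECTOR_RHS v_1
   and CODE == PLUS_EXPR this emits

     s_2 = BIT_FIELD_REF <v_1, elt_size, 0>;
     lhs_3 = lhs + s_2;
     s_4 = BIT_FIELD_REF <v_1, elt_size, elt_size>;
     lhs_5 = lhs_3 + s_4;
     ...

   and returns the SSA name holding the final accumulated value.  */
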
6128 static tree
6129 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6130 		       tree_code code, tree lhs, tree vector_rhs)
6131 {
6132   tree vectype = TREE_TYPE (vector_rhs);
6133   tree scalar_type = TREE_TYPE (vectype);
6134   tree bitsize = TYPE_SIZE (scalar_type);
6135   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6136   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6137 
6138   for (unsigned HOST_WIDE_INT bit_offset = 0;
6139        bit_offset < vec_size_in_bits;
6140        bit_offset += element_bitsize)
6141     {
6142       tree bitpos = bitsize_int (bit_offset);
6143       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6144 			 bitsize, bitpos);
6145 
6146       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6147       rhs = make_ssa_name (scalar_dest, stmt);
6148       gimple_assign_set_lhs (stmt, rhs);
6149       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6150 
6151       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6152       tree new_name = make_ssa_name (scalar_dest, stmt);
6153       gimple_assign_set_lhs (stmt, new_name);
6154       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6155       lhs = new_name;
6156     }
6157   return lhs;
6158 }
6159 
6160 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6161    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6162    statement.  CODE is the operation performed by STMT and OPS are
6163    its scalar operands.  REDUC_INDEX is the index of the operand in
6164    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6165    implements in-order reduction, or IFN_LAST if we should open-code it.
6166    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6167    that should be used to control the operation in a fully-masked loop.  */
6168 
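/* As a rough illustration (the source loop below is not taken from this
   file): an in-order reduction such as

     double r = init;
     for (int i = 0; i < n; i++)
       r += a[i];

   compiled without -ffast-math must preserve the left-to-right order of
   the additions, so instead of building a tree of partial sums the
   vectorized loop folds each vector of A into R, either with a single
   internal function call (e.g. IFN_FOLD_LEFT_PLUS) or with the open-coded
   sequence produced by vect_expand_fold_left.  */
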
6169 static bool
6170 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6171 			       gimple **vec_stmt, slp_tree slp_node,
6172 			       gimple *reduc_def_stmt,
6173 			       tree_code code, internal_fn reduc_fn,
6174 			       tree ops[3], tree vectype_in,
6175 			       int reduc_index, vec_loop_masks *masks)
6176 {
6177   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6178   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6180   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6181   gimple *new_stmt = NULL;
6182 
6183   int ncopies;
6184   if (slp_node)
6185     ncopies = 1;
6186   else
6187     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6188 
6189   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6190   gcc_assert (ncopies == 1);
6191   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6192   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6193   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6194 	      == FOLD_LEFT_REDUCTION);
6195 
6196   if (slp_node)
6197     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6198 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
6199 
6200   tree op0 = ops[1 - reduc_index];
6201 
6202   int group_size = 1;
6203   gimple *scalar_dest_def;
6204   auto_vec<tree> vec_oprnds0;
6205   if (slp_node)
6206     {
6207       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6208       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6209       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6210     }
6211   else
6212     {
6213       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6214       vec_oprnds0.create (1);
6215       vec_oprnds0.quick_push (loop_vec_def0);
6216       scalar_dest_def = stmt;
6217     }
6218 
6219   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6220   tree scalar_type = TREE_TYPE (scalar_dest);
6221   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6222 
6223   int vec_num = vec_oprnds0.length ();
6224   gcc_assert (vec_num == 1 || slp_node);
6225   tree vec_elem_type = TREE_TYPE (vectype_out);
6226   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6227 
6228   tree vector_identity = NULL_TREE;
6229   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6230     vector_identity = build_zero_cst (vectype_out);
6231 
6232   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6233   int i;
6234   tree def0;
6235   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6236     {
6237       tree mask = NULL_TREE;
6238       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6239 	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6240 
6241       /* Handle MINUS by adding the negative.  */
6242       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6243 	{
6244 	  tree negated = make_ssa_name (vectype_out);
6245 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6246 	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6247 	  def0 = negated;
6248 	}
6249 
6250       if (mask)
6251 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6252 				    vector_identity);
6253 
6254       /* On the first iteration the input is simply the scalar phi
6255 	 result, and for subsequent iterations it is the output of
6256 	 the preceding operation.  */
6257       if (reduc_fn != IFN_LAST)
6258 	{
6259 	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6260 	  /* For chained SLP reductions the output of the previous reduction
6261 	     operation serves as the input of the next. For the final statement
6262 	     the output cannot be a temporary - we reuse the original
6263 	     scalar destination of the last statement.  */
6264 	  if (i != vec_num - 1)
6265 	    {
6266 	      gimple_set_lhs (new_stmt, scalar_dest_var);
6267 	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6268 	      gimple_set_lhs (new_stmt, reduc_var);
6269 	    }
6270 	}
6271       else
6272 	{
6273 	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6274 					     reduc_var, def0);
6275 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6276 	  /* Remove the statement, so that we can use the same code paths
6277 	     as for statements that we've just created.  */
6278 	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6279 	  gsi_remove (&tmp_gsi, true);
6280 	}
6281 
6282       if (i == vec_num - 1)
6283 	{
6284 	  gimple_set_lhs (new_stmt, scalar_dest);
6285 	  vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6286 	}
6287       else
6288 	vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6289 
6290       if (slp_node)
6291 	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6292     }
6293 
6294   if (!slp_node)
6295     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6296 
6297   return true;
6298 }
6299 
6300 /* Function is_nonwrapping_integer_induction.
6301 
6302    Check if STMT (which is part of loop LOOP) is an integer induction that
6303    both increments and does not overflow.  */
6304 
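/* A worked example of the check below, with illustrative numbers only:
   for an unsigned 16-bit induction with BASE == 0 and STEP == 4 that runs
   for at most 1000 iterations, the largest value reached is
   0 + 4 * 1000 == 4000, which fits in 16 bits, so the function returns
   true; with an upper bound of 20000 iterations the maximum would be
   80000, which does not fit, and the function returns false.  */
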
6305 static bool
6306 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6307 {
6308   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6309   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6310   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6311   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6312   widest_int ni, max_loop_value, lhs_max;
6313   bool overflow = false;
6314 
6315   /* Make sure the loop is integer based.  */
6316   if (TREE_CODE (base) != INTEGER_CST
6317       || TREE_CODE (step) != INTEGER_CST)
6318     return false;
6319 
6320   /* Check that the max size of the loop will not wrap.  */
6321 
6322   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6323     return true;
6324 
6325   if (! max_stmt_executions (loop, &ni))
6326     return false;
6327 
6328   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6329 			    &overflow);
6330   if (overflow)
6331     return false;
6332 
6333   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6334 			    TYPE_SIGN (lhs_type), &overflow);
6335   if (overflow)
6336     return false;
6337 
6338   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6339 	  <= TYPE_PRECISION (lhs_type));
6340 }
6341 
6342 /* Function vectorizable_reduction.
6343 
6344    Check if STMT performs a reduction operation that can be vectorized.
6345    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6346    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6347    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6348 
6349    This function also handles reduction idioms (patterns) that have been
6350    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6351    of this form:
6352      X = pattern_expr (arg0, arg1, ..., X)
6353    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6354    sequence that had been detected and replaced by the pattern-stmt (STMT).
6355 
6356    This function also handles reduction of condition expressions, for example:
6357      for (int i = 0; i < N; i++)
6358        if (a[i] < value)
6359 	 last = a[i];
6360    This is handled by vectorizing the loop and creating an additional vector
6361    containing the loop indexes for which "a[i] < value" was true.  In the
6362    function epilogue this is reduced to a single max value and then used to
6363    index into the vector of results.
6364 
6365    In some cases of reduction patterns, the type of the reduction variable X is
6366    different than the type of the other arguments of STMT.
6367    In such cases, the vectype that is used when transforming STMT into a vector
6368    stmt is different than the vectype that is used to determine the
6369    vectorization factor, because it consists of a different number of elements
6370    than the actual number of elements that are being operated upon in parallel.
6371 
6372    For example, consider an accumulation of shorts into an int accumulator.
6373    On some targets it's possible to vectorize this pattern operating on 8
6374    shorts at a time (hence, the vectype for purposes of determining the
6375    vectorization factor should be V8HI); on the other hand, the vectype that
6376    is used to create the vector form is actually V4SI (the type of the result).
6377 
6378    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6379    indicates the actual level of parallelism (V8HI in the example), so
6380    that the right vectorization factor would be derived.  This vectype
6381    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6382    be used to create the vectorized stmt.  The right vectype for the vectorized
6383    stmt is obtained from the type of the result X:
6384         get_vectype_for_scalar_type (TREE_TYPE (X))
6385 
6386    This means that, contrary to "regular" reductions (or "regular" stmts in
6387    general), the following equation:
6388       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6389    does *NOT* necessarily hold for reduction patterns.  */
6390 
6391 bool
6392 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6393 			gimple **vec_stmt, slp_tree slp_node,
6394 			slp_instance slp_node_instance)
6395 {
6396   tree vec_dest;
6397   tree scalar_dest;
6398   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6399   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6400   tree vectype_in = NULL_TREE;
6401   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6402   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6403   enum tree_code code, orig_code;
6404   internal_fn reduc_fn;
6405   machine_mode vec_mode;
6406   int op_type;
6407   optab optab;
6408   tree new_temp = NULL_TREE;
6409   gimple *def_stmt;
6410   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6411   gimple *cond_reduc_def_stmt = NULL;
6412   enum tree_code cond_reduc_op_code = ERROR_MARK;
6413   tree scalar_type;
6414   bool is_simple_use;
6415   gimple *orig_stmt;
6416   stmt_vec_info orig_stmt_info = NULL;
6417   int i;
6418   int ncopies;
6419   int epilog_copies;
6420   stmt_vec_info prev_stmt_info, prev_phi_info;
6421   bool single_defuse_cycle = false;
6422   gimple *new_stmt = NULL;
6423   int j;
6424   tree ops[3];
6425   enum vect_def_type dts[3];
6426   bool nested_cycle = false, found_nested_cycle_def = false;
6427   bool double_reduc = false;
6428   basic_block def_bb;
6429   struct loop * def_stmt_loop, *outer_loop = NULL;
6430   tree def_arg;
6431   gimple *def_arg_stmt;
6432   auto_vec<tree> vec_oprnds0;
6433   auto_vec<tree> vec_oprnds1;
6434   auto_vec<tree> vec_oprnds2;
6435   auto_vec<tree> vect_defs;
6436   auto_vec<gimple *> phis;
6437   int vec_num;
6438   tree def0, tem;
6439   bool first_p = true;
6440   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6441   tree cond_reduc_val = NULL_TREE;
6442 
6443   /* Make sure it was already recognized as a reduction computation.  */
6444   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6445       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6446     return false;
6447 
6448   if (nested_in_vect_loop_p (loop, stmt))
6449     {
6450       outer_loop = loop;
6451       loop = loop->inner;
6452       nested_cycle = true;
6453     }
6454 
6455   /* In case of a reduction chain we switch to the first stmt in the chain, but
6456      we don't update STMT_INFO, since only the last stmt is marked as a reduction
6457      and has reduction properties.  */
6458   if (GROUP_FIRST_ELEMENT (stmt_info)
6459       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6460     {
6461       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6462       first_p = false;
6463     }
6464 
6465   if (gimple_code (stmt) == GIMPLE_PHI)
6466     {
6467       /* Analysis is fully done on the reduction stmt invocation.  */
6468       if (! vec_stmt)
6469 	{
6470 	  if (slp_node)
6471 	    slp_node_instance->reduc_phis = slp_node;
6472 
6473 	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6474 	  return true;
6475 	}
6476 
6477       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6478 	/* Leave the scalar phi in place.  Note that checking
6479 	   STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6480 	   for reductions involving a single statement.  */
6481 	return true;
6482 
6483       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6484       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6485 	reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6486 
6487       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6488 	  == EXTRACT_LAST_REDUCTION)
6489 	/* Leave the scalar phi in place.  */
6490 	return true;
6491 
6492       gcc_assert (is_gimple_assign (reduc_stmt));
6493       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6494 	{
6495 	  tree op = gimple_op (reduc_stmt, k);
6496 	  if (op == gimple_phi_result (stmt))
6497 	    continue;
6498 	  if (k == 1
6499 	      && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6500 	    continue;
6501 	  if (!vectype_in
6502 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6503 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6504 	    vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6505 	  break;
6506 	}
6507       gcc_assert (vectype_in);
6508 
6509       if (slp_node)
6510 	ncopies = 1;
6511       else
6512 	ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6513 
6514       use_operand_p use_p;
6515       gimple *use_stmt;
6516       if (ncopies > 1
6517 	  && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6518 	      <= vect_used_only_live)
6519 	  && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6520 	  && (use_stmt == reduc_stmt
6521 	      || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6522 		  == reduc_stmt)))
6523 	single_defuse_cycle = true;
6524 
6525       /* Create the destination vector  */
6526       scalar_dest = gimple_assign_lhs (reduc_stmt);
6527       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6528 
6529       if (slp_node)
6530 	/* The size vect_schedule_slp_instance computes is off for us.  */
6531 	vec_num = vect_get_num_vectors
6532 	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6533 	   * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6534 	   vectype_in);
6535       else
6536 	vec_num = 1;
6537 
6538       /* Generate the reduction PHIs upfront.  */
6539       prev_phi_info = NULL;
6540       for (j = 0; j < ncopies; j++)
6541 	{
6542 	  if (j == 0 || !single_defuse_cycle)
6543 	    {
6544 	      for (i = 0; i < vec_num; i++)
6545 		{
6546 		  /* Create the reduction-phi that defines the reduction
6547 		     operand.  */
6548 		  gimple *new_phi = create_phi_node (vec_dest, loop->header);
6549 		  set_vinfo_for_stmt (new_phi,
6550 				      new_stmt_vec_info (new_phi, loop_vinfo));
6551 
6552 		  if (slp_node)
6553 		    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6554 		  else
6555 		    {
6556 		      if (j == 0)
6557 			STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6558 		      else
6559 			STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6560 		      prev_phi_info = vinfo_for_stmt (new_phi);
6561 		    }
6562 		}
6563 	    }
6564 	}
6565 
6566       return true;
6567     }
6568 
6569   /* 1. Is vectorizable reduction?  */
6570   /* Not supportable if the reduction variable is used in the loop, unless
6571      it's a reduction chain.  */
6572   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6573       && !GROUP_FIRST_ELEMENT (stmt_info))
6574     return false;
6575 
6576   /* Reductions that are not used even in an enclosing outer-loop
6577      are expected to be "live" (used out of the loop).  */
6578   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6579       && !STMT_VINFO_LIVE_P (stmt_info))
6580     return false;
6581 
6582   /* 2. Has this been recognized as a reduction pattern?
6583 
6584      Check if STMT represents a pattern that has been recognized
6585      in earlier analysis stages.  For stmts that represent a pattern,
6586      the STMT_VINFO_RELATED_STMT field records the last stmt in
6587      the original sequence that constitutes the pattern.  */
6588 
6589   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6590   if (orig_stmt)
6591     {
6592       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6593       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6594       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6595     }
6596 
6597   /* 3. Check the operands of the operation.  The first operands are defined
6598         inside the loop body. The last operand is the reduction variable,
6599         which is defined by the loop-header-phi.  */
6600 
6601   gcc_assert (is_gimple_assign (stmt));
6602 
6603   /* Flatten RHS.  */
6604   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6605     {
6606     case GIMPLE_BINARY_RHS:
6607       code = gimple_assign_rhs_code (stmt);
6608       op_type = TREE_CODE_LENGTH (code);
6609       gcc_assert (op_type == binary_op);
6610       ops[0] = gimple_assign_rhs1 (stmt);
6611       ops[1] = gimple_assign_rhs2 (stmt);
6612       break;
6613 
6614     case GIMPLE_TERNARY_RHS:
6615       code = gimple_assign_rhs_code (stmt);
6616       op_type = TREE_CODE_LENGTH (code);
6617       gcc_assert (op_type == ternary_op);
6618       ops[0] = gimple_assign_rhs1 (stmt);
6619       ops[1] = gimple_assign_rhs2 (stmt);
6620       ops[2] = gimple_assign_rhs3 (stmt);
6621       break;
6622 
6623     case GIMPLE_UNARY_RHS:
6624       return false;
6625 
6626     default:
6627       gcc_unreachable ();
6628     }
6629 
6630   if (code == COND_EXPR && slp_node)
6631     return false;
6632 
6633   scalar_dest = gimple_assign_lhs (stmt);
6634   scalar_type = TREE_TYPE (scalar_dest);
6635   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6636       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6637     return false;
6638 
6639   /* Do not try to vectorize bit-precision reductions.  */
6640   if (!type_has_mode_precision_p (scalar_type))
6641     return false;
6642 
6643   /* All uses but the last are expected to be defined in the loop.
6644      The last use is the reduction variable.  In case of a nested cycle this
6645      assumption is not true: we use reduc_index to record the index of the
6646      reduction variable.  */
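  /* For instance (illustration only): for a stmt such as
     sum_1 = x_2 + sum_0, where sum_0 is defined by the loop-header phi,
     ops[1] is the reduction operand and reduc_index becomes 1.  */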
6647   gimple *reduc_def_stmt = NULL;
6648   int reduc_index = -1;
6649   for (i = 0; i < op_type; i++)
6650     {
6651       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6652       if (i == 0 && code == COND_EXPR)
6653         continue;
6654 
6655       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6656 					  &def_stmt, &dts[i], &tem);
6657       dt = dts[i];
6658       gcc_assert (is_simple_use);
6659       if (dt == vect_reduction_def)
6660 	{
6661           reduc_def_stmt = def_stmt;
6662 	  reduc_index = i;
6663 	  continue;
6664 	}
6665       else if (tem)
6666 	{
6667 	  /* To properly compute ncopies we are interested in the widest
6668 	     input type in case we're looking at a widening accumulation.  */
6669 	  if (!vectype_in
6670 	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6671 		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6672 	    vectype_in = tem;
6673 	}
6674 
6675       if (dt != vect_internal_def
6676 	  && dt != vect_external_def
6677 	  && dt != vect_constant_def
6678 	  && dt != vect_induction_def
6679           && !(dt == vect_nested_cycle && nested_cycle))
6680 	return false;
6681 
6682       if (dt == vect_nested_cycle)
6683         {
6684           found_nested_cycle_def = true;
6685           reduc_def_stmt = def_stmt;
6686           reduc_index = i;
6687         }
6688 
6689       if (i == 1 && code == COND_EXPR)
6690 	{
6691 	  /* Record how value of COND_EXPR is defined.  */
6692 	  if (dt == vect_constant_def)
6693 	    {
6694 	      cond_reduc_dt = dt;
6695 	      cond_reduc_val = ops[i];
6696 	    }
6697 	  if (dt == vect_induction_def
6698 	      && def_stmt != NULL
6699 	      && is_nonwrapping_integer_induction (def_stmt, loop))
6700 	    {
6701 	      cond_reduc_dt = dt;
6702 	      cond_reduc_def_stmt = def_stmt;
6703 	    }
6704 	}
6705     }
6706 
6707   if (!vectype_in)
6708     vectype_in = vectype_out;
6709 
6710   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6711      directly used in stmt.  */
6712   if (reduc_index == -1)
6713     {
6714       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6715 	{
6716 	  if (dump_enabled_p ())
6717 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 			     "in-order reduction chain without SLP.\n");
6719 	  return false;
6720 	}
6721 
6722       if (orig_stmt)
6723 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6724       else
6725 	reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6726     }
6727 
6728   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6729     return false;
6730 
6731   if (!(reduc_index == -1
6732 	|| dts[reduc_index] == vect_reduction_def
6733 	|| dts[reduc_index] == vect_nested_cycle
6734 	|| ((dts[reduc_index] == vect_internal_def
6735 	     || dts[reduc_index] == vect_external_def
6736 	     || dts[reduc_index] == vect_constant_def
6737 	     || dts[reduc_index] == vect_induction_def)
6738 	    && nested_cycle && found_nested_cycle_def)))
6739     {
6740       /* For pattern recognized stmts, orig_stmt might be a reduction,
6741 	 but some helper statements for the pattern might not, or
6742 	 might be COND_EXPRs with reduction uses in the condition.  */
6743       gcc_assert (orig_stmt);
6744       return false;
6745     }
6746 
6747   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6748   enum vect_reduction_type v_reduc_type
6749     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6750   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6751 
6752   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6753   /* If we have a condition reduction, see if we can simplify it further.  */
6754   if (v_reduc_type == COND_REDUCTION)
6755     {
6756       /* TODO: We can't yet handle reduction chains, since we need to treat
6757 	 each COND_EXPR in the chain specially, not just the last one.
6758 	 E.g. for:
6759 
6760 	    x_1 = PHI <x_3, ...>
6761 	    x_2 = a_2 ? ... : x_1;
6762 	    x_3 = a_3 ? ... : x_2;
6763 
6764 	 we're interested in the last element in x_3 for which a_2 || a_3
6765 	 is true, whereas the current reduction chain handling would
6766 	 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6767 	 as a reduction operation.  */
6768       if (reduc_index == -1)
6769 	{
6770 	  if (dump_enabled_p ())
6771 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6772 			     "conditional reduction chains not supported\n");
6773 	  return false;
6774 	}
6775 
6776       /* vect_is_simple_reduction ensured that operand 2 is the
6777 	 loop-carried operand.  */
6778       gcc_assert (reduc_index == 2);
6779 
6780       /* Loop peeling modifies the initial value of the reduction PHI, which
6781 	 means the reduction stmt to be transformed differs from the
6782 	 original stmt that was analyzed.  We need to record the reduction code for
6783 	 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6784 	 it can be used directly at transform stage.  */
6785       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6786 	  || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6787 	{
6788 	  /* Also set the reduction type to CONST_COND_REDUCTION.  */
6789 	  gcc_assert (cond_reduc_dt == vect_constant_def);
6790 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6791 	}
6792       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6793 					       vectype_in, OPTIMIZE_FOR_SPEED))
6794 	{
6795 	  if (dump_enabled_p ())
6796 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 			     "optimizing condition reduction with"
6798 			     " FOLD_EXTRACT_LAST.\n");
6799 	  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6800 	}
6801       else if (cond_reduc_dt == vect_induction_def)
6802 	{
6803 	  stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6804 	  tree base
6805 	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6806 	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6807 
6808 	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6809 		      && TREE_CODE (step) == INTEGER_CST);
6810 	  cond_reduc_val = NULL_TREE;
6811 	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6812 	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6813 	    ;
6814 	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6815 	     above base; punt if base is the minimum value of the type for
6816 	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6817 	  else if (tree_int_cst_sgn (step) == -1)
6818 	    {
6819 	      cond_reduc_op_code = MIN_EXPR;
6820 	      if (tree_int_cst_sgn (base) == -1)
6821 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6822 	      else if (tree_int_cst_lt (base,
6823 					TYPE_MAX_VALUE (TREE_TYPE (base))))
6824 		cond_reduc_val
6825 		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6826 	    }
6827 	  else
6828 	    {
6829 	      cond_reduc_op_code = MAX_EXPR;
6830 	      if (tree_int_cst_sgn (base) == 1)
6831 		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6832 	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6833 					base))
6834 		cond_reduc_val
6835 		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6836 	    }
6837 	  if (cond_reduc_val)
6838 	    {
6839 	      if (dump_enabled_p ())
6840 		dump_printf_loc (MSG_NOTE, vect_location,
6841 				 "condition expression based on "
6842 				 "integer induction.\n");
6843 	      STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6844 		= INTEGER_INDUC_COND_REDUCTION;
6845 	    }
6846 	}
6847       else if (cond_reduc_dt == vect_constant_def)
6848 	{
6849 	  enum vect_def_type cond_initial_dt;
6850 	  gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6851 	  tree cond_initial_val
6852 	    = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6853 
6854 	  gcc_assert (cond_reduc_val != NULL_TREE);
6855 	  vect_is_simple_use (cond_initial_val, loop_vinfo,
6856 			      &def_stmt, &cond_initial_dt);
6857 	  if (cond_initial_dt == vect_constant_def
6858 	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6859 				     TREE_TYPE (cond_reduc_val)))
6860 	    {
6861 	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6862 				    cond_initial_val, cond_reduc_val);
6863 	      if (e && (integer_onep (e) || integer_zerop (e)))
6864 		{
6865 		  if (dump_enabled_p ())
6866 		    dump_printf_loc (MSG_NOTE, vect_location,
6867 				     "condition expression based on "
6868 				     "compile time constant.\n");
6869 		  /* Record reduction code at analysis stage.  */
6870 		  STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6871 		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6872 		  STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6873 		    = CONST_COND_REDUCTION;
6874 		}
6875 	    }
6876 	}
6877     }
6878 
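  /* As an illustration of the CONST_COND_REDUCTION case above (the example
     code is not taken from these sources):

       int found = 0;
       for (int i = 0; i < n; i++)
         if (a[i] == key)
           found = 1;

     if-converts to found_1 = a[i] == key ? 1 : found_0; the constant 1
     compares >= the initial value 0, so the reduction can be carried out
     as a MAX reduction.  */
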
6879   if (orig_stmt)
6880     gcc_assert (tmp == orig_stmt
6881 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6882   else
6883     /* We changed STMT to be the first stmt in the reduction chain, hence we
6884        check that in this case the first element in the chain is STMT.  */
6885     gcc_assert (stmt == tmp
6886 		|| GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6887 
6888   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6889     return false;
6890 
6891   if (slp_node)
6892     ncopies = 1;
6893   else
6894     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6895 
6896   gcc_assert (ncopies >= 1);
6897 
6898   vec_mode = TYPE_MODE (vectype_in);
6899   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6900 
6901   if (code == COND_EXPR)
6902     {
6903       /* Only call during the analysis stage, otherwise we'll lose
6904 	 STMT_VINFO_TYPE.  */
6905       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6906 						ops[reduc_index], 0, NULL))
6907         {
6908           if (dump_enabled_p ())
6909 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910 			     "unsupported condition in reduction\n");
6911 	  return false;
6912         }
6913     }
6914   else
6915     {
6916       /* 4. Supportable by target?  */
6917 
6918       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6919 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
6920 	{
6921 	  /* Shifts and rotates are only supported by vectorizable_shift,
6922 	     not vectorizable_reduction.  */
6923           if (dump_enabled_p ())
6924 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6925 			     "unsupported shift or rotation.\n");
6926 	  return false;
6927 	}
6928 
6929       /* 4.1. Check support for the operation in the loop.  */
6930       optab = optab_for_tree_code (code, vectype_in, optab_default);
6931       if (!optab)
6932         {
6933           if (dump_enabled_p ())
6934 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6935 			     "no optab.\n");
6936 
6937           return false;
6938         }
6939 
6940       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6941         {
6942           if (dump_enabled_p ())
6943             dump_printf (MSG_NOTE, "op not supported by target.\n");
6944 
6945 	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6946 	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6947             return false;
6948 
6949           if (dump_enabled_p ())
6950   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6951         }
6952 
6953       /* Worthwhile without SIMD support?  */
6954       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6955 	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6956         {
6957           if (dump_enabled_p ())
6958 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 			     "not worthwhile without SIMD support.\n");
6960 
6961           return false;
6962         }
6963     }
6964 
6965   /* 4.2. Check support for the epilog operation.
6966 
6967           If STMT represents a reduction pattern, then the type of the
6968           reduction variable may be different than the type of the rest
6969           of the arguments.  For example, consider the case of accumulation
6970           of shorts into an int accumulator; the original code:
6971                         S1: int_a = (int) short_a;
6972           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6973 
6974           was replaced with:
6975                         STMT: int_acc = widen_sum <short_a, int_acc>
6976 
6977           This means that:
6978           1. The tree-code that is used to create the vector operation in the
6979              epilog code (that reduces the partial results) is not the
6980              tree-code of STMT, but is rather the tree-code of the original
6981              stmt from the pattern that STMT is replacing.  I.e, in the example
6982              above we want to use 'widen_sum' in the loop, but 'plus' in the
6983              epilog.
6984           2. The type (mode) we use to check available target support
6985              for the vector operation to be created in the *epilog*, is
6986              determined by the type of the reduction variable (in the example
6987              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6988              However the type (mode) we use to check available target support
6989              for the vector operation to be created *inside the loop*, is
6990              determined by the type of the other arguments to STMT (in the
6991              example we'd check this: optab_handler (widen_sum_optab,
6992 	     vect_short_mode)).
6993 
6994           This is contrary to "regular" reductions, in which the types of all
6995           the arguments are the same as the type of the reduction variable.
6996           For "regular" reductions we can therefore use the same vector type
6997           (and also the same tree-code) when generating the epilog code and
6998           when generating the code inside the loop.  */
6999 
7000   vect_reduction_type reduction_type
7001     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7002   if (orig_stmt
7003       && (reduction_type == TREE_CODE_REDUCTION
7004 	  || reduction_type == FOLD_LEFT_REDUCTION))
7005     {
7006       /* This is a reduction pattern: get the vectype from the type of the
7007          reduction variable, and get the tree-code from orig_stmt.  */
7008       orig_code = gimple_assign_rhs_code (orig_stmt);
7009       gcc_assert (vectype_out);
7010       vec_mode = TYPE_MODE (vectype_out);
7011     }
7012   else
7013     {
7014       /* Regular reduction: the same vectype and tree-code that are used for
7015          the vector code inside the loop can also be used for the epilog code.  */
7016       orig_code = code;
7017 
7018       if (code == MINUS_EXPR)
7019 	orig_code = PLUS_EXPR;
7020 
7021       /* For simple condition reductions, replace with the actual expression
7022 	 we want to base our reduction around.  */
7023       if (reduction_type == CONST_COND_REDUCTION)
7024 	{
7025 	  orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7026 	  gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7027 	}
7028       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7029 	orig_code = cond_reduc_op_code;
7030     }
7031 
7032   if (nested_cycle)
7033     {
7034       def_bb = gimple_bb (reduc_def_stmt);
7035       def_stmt_loop = def_bb->loop_father;
7036       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7037                                        loop_preheader_edge (def_stmt_loop));
7038       if (TREE_CODE (def_arg) == SSA_NAME
7039           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7040           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7041           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7042           && vinfo_for_stmt (def_arg_stmt)
7043           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7044               == vect_double_reduction_def)
7045         double_reduc = true;
7046     }
7047 
7048   reduc_fn = IFN_LAST;
7049 
7050   if (reduction_type == TREE_CODE_REDUCTION
7051       || reduction_type == FOLD_LEFT_REDUCTION
7052       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7053       || reduction_type == CONST_COND_REDUCTION)
7054     {
7055       if (reduction_type == FOLD_LEFT_REDUCTION
7056 	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
7057 	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7058 	{
7059 	  if (reduc_fn != IFN_LAST
7060 	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7061 						  OPTIMIZE_FOR_SPEED))
7062 	    {
7063 	      if (dump_enabled_p ())
7064 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 				 "reduc op not supported by target.\n");
7066 
7067 	      reduc_fn = IFN_LAST;
7068 	    }
7069 	}
7070       else
7071 	{
7072 	  if (!nested_cycle || double_reduc)
7073 	    {
7074 	      if (dump_enabled_p ())
7075 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7076 				 "no reduc code for scalar code.\n");
7077 
7078 	      return false;
7079 	    }
7080 	}
7081     }
7082   else if (reduction_type == COND_REDUCTION)
7083     {
7084       int scalar_precision
7085 	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7086       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7087       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7088 						nunits_out);
7089 
7090       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7091 					  OPTIMIZE_FOR_SPEED))
7092 	reduc_fn = IFN_REDUC_MAX;
7093     }
7094 
7095   if (reduction_type != EXTRACT_LAST_REDUCTION
7096       && reduc_fn == IFN_LAST
7097       && !nunits_out.is_constant ())
7098     {
7099       if (dump_enabled_p ())
7100 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7101 			 "missing target support for reduction on"
7102 			 " variable-length vectors.\n");
7103       return false;
7104     }
7105 
7106   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7107       && ncopies > 1)
7108     {
7109       if (dump_enabled_p ())
7110 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 			 "multiple types in double reduction or condition "
7112 			 "reduction.\n");
7113       return false;
7114     }
7115 
7116   /* For SLP reductions, see if there is a neutral value we can use.  */
7117   tree neutral_op = NULL_TREE;
7118   if (slp_node)
7119     neutral_op
7120       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7121 				      GROUP_FIRST_ELEMENT (stmt_info) != NULL);
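  /* For illustration: the neutral value is e.g. 0 for PLUS_EXPR, 1 for
     MULT_EXPR, all-ones for BIT_AND_EXPR and 0 for BIT_IOR_EXPR/BIT_XOR_EXPR;
     neutral_op_for_slp_reduction is the authoritative source.  */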
7122 
7123   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7124     {
7125       /* We can't support in-order reductions of code such as this:
7126 
7127 	   for (int i = 0; i < n1; ++i)
7128 	     for (int j = 0; j < n2; ++j)
7129 	       l += a[j];
7130 
7131 	 since GCC effectively transforms the loop when vectorizing:
7132 
7133 	   for (int i = 0; i < n1 / VF; ++i)
7134 	     for (int j = 0; j < n2; ++j)
7135 	       for (int k = 0; k < VF; ++k)
7136 		 l += a[j];
7137 
7138 	 which is a reassociation of the original operation.  */
7139       if (dump_enabled_p ())
7140 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7141 			 "in-order double reduction not supported.\n");
7142 
7143       return false;
7144     }
7145 
7146   if (reduction_type == FOLD_LEFT_REDUCTION
7147       && slp_node
7148       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7149     {
7150       /* We cannot use in-order reductions in this case because there is
7151 	 an implicit reassociation of the operations involved.  */
7152       if (dump_enabled_p ())
7153 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7154 			 "in-order unchained SLP reductions not supported.\n");
7155       return false;
7156     }
7157 
7158   /* For double reductions, and for SLP reductions with a neutral value,
7159      we construct a variable-length initial vector by loading a vector
7160      full of the neutral value and then shift-and-inserting the start
7161      values into the low-numbered elements.  */
7162   if ((double_reduc || neutral_op)
7163       && !nunits_out.is_constant ()
7164       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7165 					  vectype_out, OPTIMIZE_FOR_SPEED))
7166     {
7167       if (dump_enabled_p ())
7168 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7169 			 "reduction on variable-length vectors requires"
7170 			 " target support for a vector-shift-and-insert"
7171 			 " operation.\n");
7172       return false;
7173     }
7174 
7175   /* Check extra constraints for variable-length unchained SLP reductions.  */
7176   if (STMT_SLP_TYPE (stmt_info)
7177       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7178       && !nunits_out.is_constant ())
7179     {
7180       /* We checked above that we could build the initial vector when
7181 	 there's a neutral element value.  Check here for the case in
7182 	 which each SLP statement has its own initial value and in which
7183 	 that value needs to be repeated for every instance of the
7184 	 statement within the initial vector.  */
7185       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7186       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7187       if (!neutral_op
7188 	  && !can_duplicate_and_interleave_p (group_size, elt_mode))
7189 	{
7190 	  if (dump_enabled_p ())
7191 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7192 			     "unsupported form of SLP reduction for"
7193 			     " variable-length vectors: cannot build"
7194 			     " initial vector.\n");
7195 	  return false;
7196 	}
7197       /* The epilogue code relies on the number of elements being a multiple
7198 	 of the group size.  The duplicate-and-interleave approach to setting
7199 	 up the initial vector does too.  */
7200       if (!multiple_p (nunits_out, group_size))
7201 	{
7202 	  if (dump_enabled_p ())
7203 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7204 			     "unsupported form of SLP reduction for"
7205 			     " variable-length vectors: the vector size"
7206 			     " is not a multiple of the number of results.\n");
7207 	  return false;
7208 	}
7209     }
7210 
7211   /* In case of widening multiplication by a constant, we update the type
7212      of the constant to be the type of the other operand.  We check that the
7213      constant fits the type in the pattern recognition pass.  */
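  /* For example (illustrative GIMPLE, not taken from these sources), given a
     pattern stmt such as

       patt_sum = DOT_PROD_EXPR <a_5, 2, sum_4>;

     the INTEGER_CST 2 is converted here to the (narrower) type of a_5 so
     that both multiplication operands have the same type.  */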
7214   if (code == DOT_PROD_EXPR
7215       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7216     {
7217       if (TREE_CODE (ops[0]) == INTEGER_CST)
7218         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7219       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7220         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7221       else
7222         {
7223           if (dump_enabled_p ())
7224 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 			     "invalid types in dot-prod\n");
7226 
7227           return false;
7228         }
7229     }
7230 
7231   if (reduction_type == COND_REDUCTION)
7232     {
7233       widest_int ni;
7234 
7235       if (! max_loop_iterations (loop, &ni))
7236 	{
7237 	  if (dump_enabled_p ())
7238 	    dump_printf_loc (MSG_NOTE, vect_location,
7239 			     "loop count not known, cannot create cond "
7240 			     "reduction.\n");
7241 	  return false;
7242 	}
7243       /* Convert backedges to iterations.  */
7244       ni += 1;
7245 
7246       /* The additional index will be the same type as the condition.  Check
7247 	 that the iteration count fits into this type less one (because we use
7248 	 the zero slot for when there are no matches).  */
7249       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
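      /* A worked example of the check below (illustrative width only): if
	 CR_INDEX_SCALAR_TYPE is 16 bits wide, MAX_INDEX is 65535, so loops
	 of at most 65534 iterations are accepted; the zero index is reserved
	 for the case in which there are no matches.  */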
7250       if (wi::geu_p (ni, wi::to_widest (max_index)))
7251 	{
7252 	  if (dump_enabled_p ())
7253 	    dump_printf_loc (MSG_NOTE, vect_location,
7254 			     "loop size is greater than data size.\n");
7255 	  return false;
7256 	}
7257     }
7258 
7259   /* In case the vectorization factor (VF) is bigger than the number
7260      of elements that we can fit in a vectype (nunits), we have to generate
7261      more than one vector stmt - i.e - we need to "unroll" the
7262      more than one vector stmt, i.e., we need to "unroll" the
7263      in vectorizable_operation.  */
7264 
7265   /* If the reduction is used in an outer loop we need to generate
7266      VF intermediate results, like so (e.g. for ncopies=2):
7267 	r0 = phi (init, r0)
7268 	r1 = phi (init, r1)
7269 	r0 = x0 + r0;
7270         r1 = x1 + r1;
7271     (i.e. we generate VF results in 2 registers).
7272     In this case we have a separate def-use cycle for each copy, and therefore
7273     for each copy we get the vector def for the reduction variable from the
7274     respective phi node created for this copy.
7275 
7276     Otherwise (the reduction is unused in the loop nest), we can combine
7277     together intermediate results, like so (e.g. for ncopies=2):
7278 	r = phi (init, r)
7279 	r = x0 + r;
7280 	r = x1 + r;
7281    (i.e. we generate VF/2 results in a single register).
7282    In this case for each copy we get the vector def for the reduction variable
7283    from the vectorized reduction operation generated in the previous iteration.
7284 
7285    This only works when we see both the reduction PHI and its only consumer
7286    in vectorizable_reduction and there are no intermediate stmts
7287    participating.  */
7288   use_operand_p use_p;
7289   gimple *use_stmt;
7290   if (ncopies > 1
7291       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7292       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7293       && (use_stmt == stmt
7294 	  || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7295     {
7296       single_defuse_cycle = true;
7297       epilog_copies = 1;
7298     }
7299   else
7300     epilog_copies = ncopies;
7301 
7302   /* If the reduction stmt is one of the patterns with an embedded lane
7303      reduction, we cannot handle the case of ! single_defuse_cycle.  */
7304   if ((ncopies > 1
7305        && ! single_defuse_cycle)
7306       && (code == DOT_PROD_EXPR
7307 	  || code == WIDEN_SUM_EXPR
7308 	  || code == SAD_EXPR))
7309     {
7310       if (dump_enabled_p ())
7311 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 			 "multi def-use cycle not possible for lane-reducing "
7313 			 "reduction operation\n");
7314       return false;
7315     }
7316 
7317   if (slp_node)
7318     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7319   else
7320     vec_num = 1;
7321 
7322   internal_fn cond_fn = get_conditional_internal_fn (code);
7323   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7324 
7325   if (!vec_stmt) /* transformation not required.  */
7326     {
7327       if (first_p)
7328 	vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7329       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7330 	{
7331 	  if (reduction_type != FOLD_LEFT_REDUCTION
7332 	      && (cond_fn == IFN_LAST
7333 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7334 						      OPTIMIZE_FOR_SPEED)))
7335 	    {
7336 	      if (dump_enabled_p ())
7337 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7338 				 "can't use a fully-masked loop because no"
7339 				 " conditional operation is available.\n");
7340 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7341 	    }
7342 	  else if (reduc_index == -1)
7343 	    {
7344 	      if (dump_enabled_p ())
7345 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7346 				 "can't use a fully-masked loop for chained"
7347 				 " reductions.\n");
7348 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7349 	    }
7350 	  else
7351 	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7352 				   vectype_in);
7353 	}
7354       if (dump_enabled_p ()
7355 	  && reduction_type == FOLD_LEFT_REDUCTION)
7356 	dump_printf_loc (MSG_NOTE, vect_location,
7357 			 "using an in-order (fold-left) reduction.\n");
7358       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7359       return true;
7360     }
7361 
7362   /* Transform.  */
7363 
7364   if (dump_enabled_p ())
7365     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7366 
7367   /* FORNOW: Multiple types are not supported for condition.  */
7368   if (code == COND_EXPR)
7369     gcc_assert (ncopies == 1);
7370 
7371   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7372 
7373   if (reduction_type == FOLD_LEFT_REDUCTION)
7374     return vectorize_fold_left_reduction
7375       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7376        reduc_fn, ops, vectype_in, reduc_index, masks);
7377 
7378   if (reduction_type == EXTRACT_LAST_REDUCTION)
7379     {
7380       gcc_assert (!slp_node);
7381       return vectorizable_condition (stmt, gsi, vec_stmt,
7382 				     NULL, reduc_index, NULL);
7383     }
7384 
7385   /* Create the destination vector  */
7386   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7387 
7388   prev_stmt_info = NULL;
7389   prev_phi_info = NULL;
7390   if (!slp_node)
7391     {
7392       vec_oprnds0.create (1);
7393       vec_oprnds1.create (1);
7394       if (op_type == ternary_op)
7395         vec_oprnds2.create (1);
7396     }
7397 
7398   phis.create (vec_num);
7399   vect_defs.create (vec_num);
7400   if (!slp_node)
7401     vect_defs.quick_push (NULL_TREE);
7402 
7403   if (slp_node)
7404     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7405   else
7406     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7407 
7408   for (j = 0; j < ncopies; j++)
7409     {
7410       if (code == COND_EXPR)
7411         {
7412           gcc_assert (!slp_node);
7413           vectorizable_condition (stmt, gsi, vec_stmt,
7414                                   PHI_RESULT (phis[0]),
7415                                   reduc_index, NULL);
7416           /* Multiple types are not supported for condition.  */
7417           break;
7418         }
7419 
7420       /* Handle uses.  */
7421       if (j == 0)
7422         {
7423 	  if (slp_node)
7424 	    {
7425 	      /* Get vec defs for all the operands except the reduction index,
7426 		 ensuring the ordering of the ops in the vector is kept.  */
7427 	      auto_vec<tree, 3> slp_ops;
7428 	      auto_vec<vec<tree>, 3> vec_defs;
7429 
7430 	      slp_ops.quick_push (ops[0]);
7431 	      slp_ops.quick_push (ops[1]);
7432 	      if (op_type == ternary_op)
7433 		slp_ops.quick_push (ops[2]);
7434 
7435 	      vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7436 
7437 	      vec_oprnds0.safe_splice (vec_defs[0]);
7438 	      vec_defs[0].release ();
7439 	      vec_oprnds1.safe_splice (vec_defs[1]);
7440 	      vec_defs[1].release ();
7441 	      if (op_type == ternary_op)
7442 		{
7443 		  vec_oprnds2.safe_splice (vec_defs[2]);
7444 		  vec_defs[2].release ();
7445 		}
7446 	    }
7447           else
7448 	    {
7449               vec_oprnds0.quick_push
7450 		(vect_get_vec_def_for_operand (ops[0], stmt));
7451               vec_oprnds1.quick_push
7452 		(vect_get_vec_def_for_operand (ops[1], stmt));
7453               if (op_type == ternary_op)
7454 		vec_oprnds2.quick_push
7455 		  (vect_get_vec_def_for_operand (ops[2], stmt));
7456 	    }
7457         }
7458       else
7459         {
7460           if (!slp_node)
7461             {
7462 	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7463 
7464 	      if (single_defuse_cycle && reduc_index == 0)
7465 		vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7466 	      else
7467 		vec_oprnds0[0]
7468 		  = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7469 	      if (single_defuse_cycle && reduc_index == 1)
7470 		vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7471 	      else
7472 		vec_oprnds1[0]
7473 		  = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7474 	      if (op_type == ternary_op)
7475 		{
7476 		  if (single_defuse_cycle && reduc_index == 2)
7477 		    vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7478 		  else
7479 		    vec_oprnds2[0]
7480 		      = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7481 		}
7482             }
7483         }
7484 
7485       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7486         {
7487 	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7488 	  if (masked_loop_p)
7489 	    {
7490 	      /* Make sure that the reduction accumulator is vop[0].  */
7491 	      if (reduc_index == 1)
7492 		{
7493 		  gcc_assert (commutative_tree_code (code));
7494 		  std::swap (vop[0], vop[1]);
7495 		}
7496 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7497 					      vectype_in, i * ncopies + j);
7498 	      gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7499 							vop[0], vop[1]);
7500 	      new_temp = make_ssa_name (vec_dest, call);
7501 	      gimple_call_set_lhs (call, new_temp);
7502 	      gimple_call_set_nothrow (call, true);
7503 	      new_stmt = call;
7504 	    }
7505 	  else
7506 	    {
7507 	      if (op_type == ternary_op)
7508 		vop[2] = vec_oprnds2[i];
7509 
7510 	      new_temp = make_ssa_name (vec_dest, new_stmt);
7511 	      new_stmt = gimple_build_assign (new_temp, code,
7512 					      vop[0], vop[1], vop[2]);
7513 	    }
7514 	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
7515 
7516           if (slp_node)
7517             {
7518               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7519               vect_defs.quick_push (new_temp);
7520             }
7521           else
7522             vect_defs[0] = new_temp;
7523         }
7524 
7525       if (slp_node)
7526         continue;
7527 
7528       if (j == 0)
7529 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7530       else
7531 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7532 
7533       prev_stmt_info = vinfo_for_stmt (new_stmt);
7534     }
7535 
7536   /* Finalize the reduction-phi (set its arguments) and create the
7537      epilog reduction code.  */
7538   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7539     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7540 
7541   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7542 				    epilog_copies, reduc_fn, phis,
7543 				    double_reduc, slp_node, slp_node_instance,
7544 				    cond_reduc_val, cond_reduc_op_code,
7545 				    neutral_op);
7546 
7547   return true;
7548 }
7549 
7550 /* Function vect_min_worthwhile_factor.
7551 
7552    For a loop where we could vectorize the operation indicated by CODE,
7553    return the minimum vectorization factor that makes it worthwhile
7554    to use generic vectors.  */
7555 static unsigned int
7556 vect_min_worthwhile_factor (enum tree_code code)
7557 {
7558   switch (code)
7559     {
7560     case PLUS_EXPR:
7561     case MINUS_EXPR:
7562     case NEGATE_EXPR:
7563       return 4;
7564 
7565     case BIT_AND_EXPR:
7566     case BIT_IOR_EXPR:
7567     case BIT_XOR_EXPR:
7568     case BIT_NOT_EXPR:
7569       return 2;
7570 
7571     default:
7572       return INT_MAX;
7573     }
7574 }
7575 
7576 /* Return true if VINFO indicates we are doing loop vectorization and if
7577    it is worth decomposing CODE operations into scalar operations for
7578    that loop's vectorization factor.  */
7579 
7580 bool
7581 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7582 {
7583   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7584   unsigned HOST_WIDE_INT value;
7585   return (loop_vinfo
7586 	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7587 	  && value >= vect_min_worthwhile_factor (code));
7588 }
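
/* For illustration: with a constant vectorization factor of 2, the function
   above returns true for BIT_AND_EXPR (threshold 2) but false for PLUS_EXPR
   (threshold 4), so only the bitwise operation would be considered worth
   decomposing into scalar operations for that loop.  */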
7589 
7590 /* Function vectorizable_induction
7591 
7592    Check if PHI performs an induction computation that can be vectorized.
7593    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7594    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7595    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7596 
7597 bool
7598 vectorizable_induction (gimple *phi,
7599 			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7600 			gimple **vec_stmt, slp_tree slp_node)
7601 {
7602   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7603   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7604   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7605   unsigned ncopies;
7606   bool nested_in_vect_loop = false;
7607   struct loop *iv_loop;
7608   tree vec_def;
7609   edge pe = loop_preheader_edge (loop);
7610   basic_block new_bb;
7611   tree new_vec, vec_init, vec_step, t;
7612   tree new_name;
7613   gimple *new_stmt;
7614   gphi *induction_phi;
7615   tree induc_def, vec_dest;
7616   tree init_expr, step_expr;
7617   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7618   unsigned i;
7619   tree expr;
7620   gimple_seq stmts;
7621   imm_use_iterator imm_iter;
7622   use_operand_p use_p;
7623   gimple *exit_phi;
7624   edge latch_e;
7625   tree loop_arg;
7626   gimple_stmt_iterator si;
7627   basic_block bb = gimple_bb (phi);
7628 
7629   if (gimple_code (phi) != GIMPLE_PHI)
7630     return false;
7631 
7632   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7633     return false;
7634 
7635   /* Make sure it was recognized as induction computation.  */
7636   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7637     return false;
7638 
7639   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7640   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7641 
7642   if (slp_node)
7643     ncopies = 1;
7644   else
7645     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7646   gcc_assert (ncopies >= 1);
7647 
7648   /* FORNOW. These restrictions should be relaxed.  */
7649   if (nested_in_vect_loop_p (loop, phi))
7650     {
7651       imm_use_iterator imm_iter;
7652       use_operand_p use_p;
7653       gimple *exit_phi;
7654       edge latch_e;
7655       tree loop_arg;
7656 
7657       if (ncopies > 1)
7658 	{
7659 	  if (dump_enabled_p ())
7660 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7661 			     "multiple types in nested loop.\n");
7662 	  return false;
7663 	}
7664 
7665       /* FORNOW: outer loop induction with SLP not supported.  */
7666       if (STMT_SLP_TYPE (stmt_info))
7667 	return false;
7668 
7669       exit_phi = NULL;
7670       latch_e = loop_latch_edge (loop->inner);
7671       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7672       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7673 	{
7674 	  gimple *use_stmt = USE_STMT (use_p);
7675 	  if (is_gimple_debug (use_stmt))
7676 	    continue;
7677 
7678 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7679 	    {
7680 	      exit_phi = use_stmt;
7681 	      break;
7682 	    }
7683 	}
7684       if (exit_phi)
7685 	{
7686 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7687 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7688 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7689 	    {
7690 	      if (dump_enabled_p ())
7691 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7692 				 "inner-loop induction only used outside "
7693 				 "of the outer vectorized loop.\n");
7694 	      return false;
7695 	    }
7696 	}
7697 
7698       nested_in_vect_loop = true;
7699       iv_loop = loop->inner;
7700     }
7701   else
7702     iv_loop = loop;
7703   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7704 
7705   if (slp_node && !nunits.is_constant ())
7706     {
7707       /* The current SLP code creates the initial value element-by-element.  */
7708       if (dump_enabled_p ())
7709 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7710 			 "SLP induction not supported for variable-length"
7711 			 " vectors.\n");
7712       return false;
7713     }
7714 
7715   if (!vec_stmt) /* transformation not required.  */
7716     {
7717       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7718       if (dump_enabled_p ())
7719         dump_printf_loc (MSG_NOTE, vect_location,
7720                          "=== vectorizable_induction ===\n");
7721       vect_model_induction_cost (stmt_info, ncopies);
7722       return true;
7723     }
7724 
7725   /* Transform.  */
7726 
7727   /* Compute a vector variable, initialized with the first VF values of
7728      the induction variable.  E.g., for an iv with IV_PHI='X' and
7729      evolution S, for a vector of 4 units, we want to compute:
7730      [X, X + S, X + 2*S, X + 3*S].  */
7731 
7732   if (dump_enabled_p ())
7733     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7734 
7735   latch_e = loop_latch_edge (iv_loop);
7736   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7737 
7738   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7739   gcc_assert (step_expr != NULL_TREE);
7740 
7741   pe = loop_preheader_edge (iv_loop);
7742   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7743 				     loop_preheader_edge (iv_loop));
7744 
7745   stmts = NULL;
7746   if (!nested_in_vect_loop)
7747     {
7748       /* Convert the initial value to the desired type.  */
7749       tree new_type = TREE_TYPE (vectype);
7750       init_expr = gimple_convert (&stmts, new_type, init_expr);
7751 
7752       /* If we are using the loop mask to "peel" for alignment then we need
7753 	 to adjust the start value here.  */
7754       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7755       if (skip_niters != NULL_TREE)
7756 	{
7757 	  if (FLOAT_TYPE_P (vectype))
7758 	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7759 					skip_niters);
7760 	  else
7761 	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7762 	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7763 					 skip_niters, step_expr);
7764 	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7765 				    init_expr, skip_step);
7766 	}
7767     }
7768 
7769   /* Convert the step to the desired type.  */
7770   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7771 
7772   if (stmts)
7773     {
7774       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7775       gcc_assert (!new_bb);
7776     }
7777 
7778   /* Find the first insertion point in the BB.  */
7779   si = gsi_after_labels (bb);
7780 
7781   /* For SLP induction we have to generate several IVs as for example
7782      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7783      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7784      [VF*S, VF*S, VF*S, VF*S] for all.  */
7785   if (slp_node)
7786     {
7787       /* Enforced above.  */
7788       unsigned int const_nunits = nunits.to_constant ();
7789 
7790       /* Generate [VF*S, VF*S, ... ].  */
7791       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7792 	{
7793 	  expr = build_int_cst (integer_type_node, vf);
7794 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7795 	}
7796       else
7797 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7798       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7799 			      expr, step_expr);
7800       if (! CONSTANT_CLASS_P (new_name))
7801 	new_name = vect_init_vector (phi, new_name,
7802 				     TREE_TYPE (step_expr), NULL);
7803       new_vec = build_vector_from_val (vectype, new_name);
7804       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7805 
7806       /* Now generate the IVs.  */
7807       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7808       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7809       unsigned elts = const_nunits * nvects;
7810       unsigned nivs = least_common_multiple (group_size,
7811 					     const_nunits) / const_nunits;
7812       gcc_assert (elts % group_size == 0);
7813       tree elt = init_expr;
7814       unsigned ivn;
7815       for (ivn = 0; ivn < nivs; ++ivn)
7816 	{
7817 	  tree_vector_builder elts (vectype, const_nunits, 1);
7818 	  stmts = NULL;
7819 	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7820 	    {
7821 	      if (ivn*const_nunits + eltn >= group_size
7822 		  && (ivn * const_nunits + eltn) % group_size == 0)
7823 		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7824 				    elt, step_expr);
7825 	      elts.quick_push (elt);
7826 	    }
7827 	  vec_init = gimple_build_vector (&stmts, &elts);
7828 	  if (stmts)
7829 	    {
7830 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7831 	      gcc_assert (!new_bb);
7832 	    }
7833 
7834 	  /* Create the induction-phi that defines the induction-operand.  */
7835 	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7836 	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7837 	  set_vinfo_for_stmt (induction_phi,
7838 			      new_stmt_vec_info (induction_phi, loop_vinfo));
7839 	  induc_def = PHI_RESULT (induction_phi);
7840 
7841 	  /* Create the iv update inside the loop  */
7842 	  vec_def = make_ssa_name (vec_dest);
7843 	  new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7844 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7845 	  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7846 
7847 	  /* Set the arguments of the phi node:  */
7848 	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7849 	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7850 		       UNKNOWN_LOCATION);
7851 
7852 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7853 	}
7854 
7855       /* Re-use IVs when we can.  */
7856       if (ivn < nvects)
7857 	{
7858 	  unsigned vfp
7859 	    = least_common_multiple (group_size, const_nunits) / group_size;
7860 	  /* Generate [VF'*S, VF'*S, ... ].  */
7861 	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7862 	    {
7863 	      expr = build_int_cst (integer_type_node, vfp);
7864 	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7865 	    }
7866 	  else
7867 	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7868 	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7869 				  expr, step_expr);
7870 	  if (! CONSTANT_CLASS_P (new_name))
7871 	    new_name = vect_init_vector (phi, new_name,
7872 					 TREE_TYPE (step_expr), NULL);
7873 	  new_vec = build_vector_from_val (vectype, new_name);
7874 	  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7875 	  for (; ivn < nvects; ++ivn)
7876 	    {
7877 	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7878 	      tree def;
7879 	      if (gimple_code (iv) == GIMPLE_PHI)
7880 		def = gimple_phi_result (iv);
7881 	      else
7882 		def = gimple_assign_lhs (iv);
7883 	      new_stmt = gimple_build_assign (make_ssa_name (vectype),
7884 					      PLUS_EXPR,
7885 					      def, vec_step);
7886 	      if (gimple_code (iv) == GIMPLE_PHI)
7887 		gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7888 	      else
7889 		{
7890 		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7891 		  gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7892 		}
7893 	      set_vinfo_for_stmt (new_stmt,
7894 				  new_stmt_vec_info (new_stmt, loop_vinfo));
7895 	      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7896 	    }
7897 	}
7898 
7899       return true;
7900     }
7901 
7902   /* Create the vector that holds the initial_value of the induction.  */
7903   if (nested_in_vect_loop)
7904     {
7905       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7906 	 been created during vectorization of previous stmts.  We obtain it
7907 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7908       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7909       /* If the initial value is not of proper type, convert it.  */
7910       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7911 	{
7912 	  new_stmt
7913 	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7914 							  vect_simple_var,
7915 							  "vec_iv_"),
7916 				   VIEW_CONVERT_EXPR,
7917 				   build1 (VIEW_CONVERT_EXPR, vectype,
7918 					   vec_init));
7919 	  vec_init = gimple_assign_lhs (new_stmt);
7920 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7921 						 new_stmt);
7922 	  gcc_assert (!new_bb);
7923 	  set_vinfo_for_stmt (new_stmt,
7924 			      new_stmt_vec_info (new_stmt, loop_vinfo));
7925 	}
7926     }
7927   else
7928     {
7929       /* iv_loop is the loop to be vectorized. Create:
7930 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7931       stmts = NULL;
7932       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7933 
7934       unsigned HOST_WIDE_INT const_nunits;
7935       if (nunits.is_constant (&const_nunits))
7936 	{
7937 	  tree_vector_builder elts (vectype, const_nunits, 1);
7938 	  elts.quick_push (new_name);
7939 	  for (i = 1; i < const_nunits; i++)
7940 	    {
7941 	      /* Create: new_name_i = new_name + step_expr  */
7942 	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7943 				       new_name, step_expr);
7944 	      elts.quick_push (new_name);
7945 	    }
7946 	  /* Create a vector from [new_name_0, new_name_1, ...,
7947 	     new_name_nunits-1]  */
7948 	  vec_init = gimple_build_vector (&stmts, &elts);
7949 	}
7950       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7951 	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7952 	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7953 				 new_name, step_expr);
7954       else
7955 	{
7956 	  /* Build:
7957 	        [base, base, base, ...]
7958 		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7959 	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7960 	  gcc_assert (flag_associative_math);
7961 	  tree index = build_index_vector (vectype, 0, 1);
7962 	  tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7963 							new_name);
7964 	  tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7965 							step_expr);
7966 	  vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7967 	  vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7968 				   vec_init, step_vec);
7969 	  vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7970 				   vec_init, base_vec);
7971 	}
7972 
7973       if (stmts)
7974 	{
7975 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7976 	  gcc_assert (!new_bb);
7977 	}
7978     }
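
  /* Concrete instance of the comment above (hypothetical values): with
     init_expr X = 3, step_expr S = 2 and four constant lanes, the code above
     builds vec_init = { 3, 5, 7, 9 } on the preheader edge.  */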
7979 
7980 
7981   /* Create the vector that holds the step of the induction.  */
7982   if (nested_in_vect_loop)
7983     /* iv_loop is nested in the loop to be vectorized. Generate:
7984        vec_step = [S, S, S, S]  */
7985     new_name = step_expr;
7986   else
7987     {
7988       /* iv_loop is the loop to be vectorized. Generate:
7989 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7990       gimple_seq seq = NULL;
7991       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7992 	{
7993 	  expr = build_int_cst (integer_type_node, vf);
7994 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7995 	}
7996       else
7997 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7998       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7999 			       expr, step_expr);
8000       if (seq)
8001 	{
8002 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8003 	  gcc_assert (!new_bb);
8004 	}
8005     }
8006 
8007   t = unshare_expr (new_name);
8008   gcc_assert (CONSTANT_CLASS_P (new_name)
8009 	      || TREE_CODE (new_name) == SSA_NAME);
8010   new_vec = build_vector_from_val (vectype, t);
8011   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8012 
8013 
8014   /* Create the following def-use cycle:
8015      loop prolog:
8016          vec_init = ...
8017 	 vec_step = ...
8018      loop:
8019          vec_iv = PHI <vec_init, vec_loop>
8020          ...
8021          STMT
8022          ...
8023          vec_loop = vec_iv + vec_step;  */
8024 
8025   /* Create the induction-phi that defines the induction-operand.  */
8026   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8027   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8028   set_vinfo_for_stmt (induction_phi,
8029 		      new_stmt_vec_info (induction_phi, loop_vinfo));
8030   induc_def = PHI_RESULT (induction_phi);
8031 
8032   /* Create the iv update inside the loop  */
8033   vec_def = make_ssa_name (vec_dest);
8034   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8035   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8036   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8037 
8038   /* Set the arguments of the phi node:  */
8039   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8040   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8041 	       UNKNOWN_LOCATION);
8042 
8043   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8044 
8045   /* In case the vectorization factor (VF) is bigger than the number
8046      of elements that we can fit in a vectype (nunits), we have to generate
8047      more than one vector stmt - i.e., we need to "unroll" the
8048      vector stmt by a factor VF/nunits.  For more details see documentation
8049      in vectorizable_operation.  */
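
  /* For example (hypothetical numbers): with nunits = 4 and ncopies = 2, the
     code below adds { 4*S, 4*S, 4*S, 4*S } to the first vector IV to produce
     the second copy, so the two copies together cover the VF = 8 scalar
     iterations of one vector iteration.  */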
8050 
8051   if (ncopies > 1)
8052     {
8053       gimple_seq seq = NULL;
8054       stmt_vec_info prev_stmt_vinfo;
8055       /* FORNOW. This restriction should be relaxed.  */
8056       gcc_assert (!nested_in_vect_loop);
8057 
8058       /* Create the vector that holds the step of the induction.  */
8059       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8060 	{
8061 	  expr = build_int_cst (integer_type_node, nunits);
8062 	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8063 	}
8064       else
8065 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8066       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8067 			       expr, step_expr);
8068       if (seq)
8069 	{
8070 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8071 	  gcc_assert (!new_bb);
8072 	}
8073 
8074       t = unshare_expr (new_name);
8075       gcc_assert (CONSTANT_CLASS_P (new_name)
8076 		  || TREE_CODE (new_name) == SSA_NAME);
8077       new_vec = build_vector_from_val (vectype, t);
8078       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8079 
8080       vec_def = induc_def;
8081       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8082       for (i = 1; i < ncopies; i++)
8083 	{
8084 	  /* vec_i = vec_prev + vec_step  */
8085 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8086 					  vec_def, vec_step);
8087 	  vec_def = make_ssa_name (vec_dest, new_stmt);
8088 	  gimple_assign_set_lhs (new_stmt, vec_def);
8089 
8090 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8091 	  set_vinfo_for_stmt (new_stmt,
8092 			      new_stmt_vec_info (new_stmt, loop_vinfo));
8093 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8094 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8095 	}
8096     }
8097 
8098   if (nested_in_vect_loop)
8099     {
8100       /* Find the loop-closed exit-phi of the induction, and record
8101          the final vector of induction results:  */
8102       exit_phi = NULL;
8103       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8104         {
8105 	  gimple *use_stmt = USE_STMT (use_p);
8106 	  if (is_gimple_debug (use_stmt))
8107 	    continue;
8108 
8109 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8110 	    {
8111 	      exit_phi = use_stmt;
8112 	      break;
8113 	    }
8114         }
8115       if (exit_phi)
8116 	{
8117 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8118 	  /* FORNOW.  We do not yet support the case where an inner-loop induction
8119 	     is used only outside the outer-loop (i.e. not in the outer-loop itself).  */
8120 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8121 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
8122 
8123 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8124 	  if (dump_enabled_p ())
8125 	    {
8126 	      dump_printf_loc (MSG_NOTE, vect_location,
8127 			       "vector of inductions after inner-loop:");
8128 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8129 	    }
8130 	}
8131     }
8132 
8133 
8134   if (dump_enabled_p ())
8135     {
8136       dump_printf_loc (MSG_NOTE, vect_location,
8137 		       "transform induction: created def-use cycle: ");
8138       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8139       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8140 			SSA_NAME_DEF_STMT (vec_def), 0);
8141     }
8142 
8143   return true;
8144 }
8145 
8146 /* Function vectorizable_live_operation.
8147 
8148    STMT computes a value that is used outside the loop.  Check if
8149    it can be supported.  */
8150 
8151 bool
8152 vectorizable_live_operation (gimple *stmt,
8153 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8154 			     slp_tree slp_node, int slp_index,
8155 			     gimple **vec_stmt)
8156 {
8157   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8158   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8159   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8160   imm_use_iterator imm_iter;
8161   tree lhs, lhs_type, bitsize, vec_bitsize;
8162   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8163   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8164   int ncopies;
8165   gimple *use_stmt;
8166   auto_vec<tree> vec_oprnds;
8167   int vec_entry = 0;
8168   poly_uint64 vec_index = 0;
8169 
8170   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8171 
8172   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8173     return false;
8174 
8175   /* FORNOW.  CHECKME.  */
8176   if (nested_in_vect_loop_p (loop, stmt))
8177     return false;
8178 
8179   /* If STMT is not relevant and it is a simple assignment and its inputs are
8180      invariant then it can remain in place, unvectorized.  The original last
8181      scalar value that it computes will be used.  */
8182   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8183     {
8184       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8185       if (dump_enabled_p ())
8186 	dump_printf_loc (MSG_NOTE, vect_location,
8187 			 "statement is simple and uses invariant.  Leaving in "
8188 			 "place.\n");
8189       return true;
8190     }
8191 
8192   if (slp_node)
8193     ncopies = 1;
8194   else
8195     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8196 
8197   if (slp_node)
8198     {
8199       gcc_assert (slp_index >= 0);
8200 
8201       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8202       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8203 
8204       /* Get the last occurrence of the scalar index from the concatenation of
8205 	 all the slp vectors. Calculate which slp vector it is and the index
8206 	 within.  */
8207       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8208 
8209       /* Calculate which vector contains the result, and which lane of
8210 	 that vector we need.  */
8211       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8212 	{
8213 	  if (dump_enabled_p ())
8214 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8215 			     "Cannot determine which vector holds the"
8216 			     " final result.\n");
8217 	  return false;
8218 	}
8219     }
8220 
8221   if (!vec_stmt)
8222     {
8223       /* No transformation required.  */
8224       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8225 	{
8226 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8227 					       OPTIMIZE_FOR_SPEED))
8228 	    {
8229 	      if (dump_enabled_p ())
8230 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8231 				 "can't use a fully-masked loop because "
8232 				 "the target doesn't support extract last "
8233 				 "reduction.\n");
8234 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8235 	    }
8236 	  else if (slp_node)
8237 	    {
8238 	      if (dump_enabled_p ())
8239 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8240 				 "can't use a fully-masked loop because an "
8241 				 "SLP statement is live after the loop.\n");
8242 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8243 	    }
8244 	  else if (ncopies > 1)
8245 	    {
8246 	      if (dump_enabled_p ())
8247 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 				 "can't use a fully-masked loop because"
8249 				 " ncopies is greater than 1.\n");
8250 	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8251 	    }
8252 	  else
8253 	    {
8254 	      gcc_assert (ncopies == 1 && !slp_node);
8255 	      vect_record_loop_mask (loop_vinfo,
8256 				     &LOOP_VINFO_MASKS (loop_vinfo),
8257 				     1, vectype);
8258 	    }
8259 	}
8260       return true;
8261     }
8262 
8263   /* If stmt has a related stmt, then use that for getting the lhs.  */
8264   if (is_pattern_stmt_p (stmt_info))
8265     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8266 
8267   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8268 	: gimple_get_lhs (stmt);
8269   lhs_type = TREE_TYPE (lhs);
8270 
8271   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8272 	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8273 	     : TYPE_SIZE (TREE_TYPE (vectype)));
8274   vec_bitsize = TYPE_SIZE (vectype);
8275 
8276   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8277   tree vec_lhs, bitstart;
8278   if (slp_node)
8279     {
8280       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8281 
8282       /* Get the correct slp vectorized stmt.  */
8283       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8284       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8285 	vec_lhs = gimple_phi_result (phi);
8286       else
8287 	vec_lhs = gimple_get_lhs (vec_stmt);
8288 
8289       /* Get entry to use.  */
8290       bitstart = bitsize_int (vec_index);
8291       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8292     }
8293   else
8294     {
8295       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8296       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8297       gcc_checking_assert (ncopies == 1
8298 			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8299 
8300       /* For multiple copies, get the last copy.  */
8301       for (int i = 1; i < ncopies; ++i)
8302 	vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8303 						  vec_lhs);
8304 
8305       /* Get the last lane in the vector.  */
8306       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8307     }
8308 
8309   gimple_seq stmts = NULL;
8310   tree new_tree;
8311   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8312     {
8313       /* Emit:
8314 
8315 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8316 
8317 	 where VEC_LHS is the vectorized live-out result and MASK is
8318 	 the loop mask for the final iteration.  */
8319       gcc_assert (ncopies == 1 && !slp_node);
8320       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8321       tree scalar_res = make_ssa_name (scalar_type);
8322       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8323 				      1, vectype, 0);
8324       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8325 						    2, mask, vec_lhs);
8326       gimple_call_set_lhs (new_stmt, scalar_res);
8327       gimple_seq_add_stmt (&stmts, new_stmt);
8328 
8329       /* Convert the extracted vector element to the required scalar type.  */
8330       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8331     }
8332   else
8333     {
8334       tree bftype = TREE_TYPE (vectype);
8335       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8336 	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8337       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8338       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8339 				       &stmts, true, NULL_TREE);
8340     }
8341 
8342   if (stmts)
8343     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8344 
8345   /* Replace uses of lhs with the newly computed result.  If the use stmt is
8346      a single-arg PHI, just replace all uses of the PHI result.  This is needed
8347      because the lcssa PHI defining lhs may come before the newly inserted stmt.  */
8348   use_operand_p use_p;
8349   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8350     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8351 	&& !is_gimple_debug (use_stmt))
8352     {
8353       if (gimple_code (use_stmt) == GIMPLE_PHI
8354 	  && gimple_phi_num_args (use_stmt) == 1)
8355 	{
8356 	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8357 	}
8358       else
8359 	{
8360 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8361 	    SET_USE (use_p, new_tree);
8362 	}
8363       update_stmt (use_stmt);
8364     }
8365 
8366   return true;
8367 }
8368 
8369 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8370 
8371 static void
8372 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8373 {
8374   ssa_op_iter op_iter;
8375   imm_use_iterator imm_iter;
8376   def_operand_p def_p;
8377   gimple *ustmt;
8378 
8379   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8380     {
8381       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8382 	{
8383 	  basic_block bb;
8384 
8385 	  if (!is_gimple_debug (ustmt))
8386 	    continue;
8387 
8388 	  bb = gimple_bb (ustmt);
8389 
8390 	  if (!flow_bb_inside_loop_p (loop, bb))
8391 	    {
8392 	      if (gimple_debug_bind_p (ustmt))
8393 		{
8394 		  if (dump_enabled_p ())
8395 		    dump_printf_loc (MSG_NOTE, vect_location,
8396                                      "killing debug use\n");
8397 
8398 		  gimple_debug_bind_reset_value (ustmt);
8399 		  update_stmt (ustmt);
8400 		}
8401 	      else
8402 		gcc_unreachable ();
8403 	    }
8404 	}
8405     }
8406 }
8407 
8408 /* Given loop represented by LOOP_VINFO, return true if computation of
8409    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8410    otherwise.  */
8411 
8412 static bool
8413 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8414 {
8415   /* Constant case.  */
8416   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8417     {
8418       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8419       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8420 
8421       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8422       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8423       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8424 	return true;
8425     }
8426 
8427   widest_int max;
8428   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8429   /* Check the upper bound of loop niters.  */
8430   if (get_max_loop_iterations (loop, &max))
8431     {
8432       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8433       signop sgn = TYPE_SIGN (type);
8434       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8435       if (max < type_max)
8436 	return true;
8437     }
8438   return false;
8439 }
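
/* For illustration: if the type of NITERS is a 32-bit unsigned int, the bound
   check above succeeds when NITERSM1 is known to be at most 0xfffffffe, since
   NITERSM1 + 1 then cannot wrap; if NITERSM1 may be 0xffffffff, the addition
   could overflow to zero and the function returns false.  */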
8440 
8441 /* Return a mask type with half the number of elements as TYPE.  */
8442 
8443 tree
8444 vect_halve_mask_nunits (tree type)
8445 {
8446   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8447   return build_truth_vector_type (nunits, current_vector_size);
8448 }
8449 
8450 /* Return a mask type with twice as many elements as TYPE.  */
8451 
8452 tree
8453 vect_double_mask_nunits (tree type)
8454 {
8455   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8456   return build_truth_vector_type (nunits, current_vector_size);
8457 }
8458 
8459 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8460    contain a sequence of NVECTORS masks that each control a vector of type
8461    VECTYPE.  */
8462 
8463 void
8464 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8465 		       unsigned int nvectors, tree vectype)
8466 {
8467   gcc_assert (nvectors != 0);
8468   if (masks->length () < nvectors)
8469     masks->safe_grow_cleared (nvectors);
8470   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8471   /* The number of scalars per iteration and the number of vectors are
8472      both compile-time constants.  */
8473   unsigned int nscalars_per_iter
8474     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8475 		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8476   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8477     {
8478       rgm->max_nscalars_per_iter = nscalars_per_iter;
8479       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8480     }
8481 }
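
/* For example (hypothetical numbers): in a loop with VF = 16, a statement
   operating on vectors of 8 elements needs nvectors = 2 masks per vector
   iteration; the call above records nscalars_per_iter = 2 * 8 / 16 = 1 in the
   rgroup at index nvectors - 1, together with an 8-element mask type.  */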
8482 
8483 /* Given a complete set of masks MASKS, extract mask number INDEX
8484    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8485    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8486 
8487    See the comment above vec_loop_masks for more details about the mask
8488    arrangement.  */
8489 
8490 tree
8491 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8492 		    unsigned int nvectors, tree vectype, unsigned int index)
8493 {
8494   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8495   tree mask_type = rgm->mask_type;
8496 
8497   /* Populate the rgroup's mask array, if this is the first time we've
8498      used it.  */
8499   if (rgm->masks.is_empty ())
8500     {
8501       rgm->masks.safe_grow_cleared (nvectors);
8502       for (unsigned int i = 0; i < nvectors; ++i)
8503 	{
8504 	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8505 	  /* Provide a dummy definition until the real one is available.  */
8506 	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8507 	  rgm->masks[i] = mask;
8508 	}
8509     }
8510 
8511   tree mask = rgm->masks[index];
8512   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8513 		TYPE_VECTOR_SUBPARTS (vectype)))
8514     {
8515       /* A loop mask for data type X can be reused for data type Y
8516 	 if X has N times more elements than Y and if Y's elements
8517 	 are N times bigger than X's.  In this case each sequence
8518 	 of N elements in the loop mask will be all-zero or all-one.
8519 	 We can then view-convert the mask so that each sequence of
8520 	 N elements is replaced by a single element.  */
8521       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8522 			      TYPE_VECTOR_SUBPARTS (vectype)));
8523       gimple_seq seq = NULL;
8524       mask_type = build_same_sized_truth_vector_type (vectype);
8525       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8526       if (seq)
8527 	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8528     }
8529   return mask;
8530 }
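
/* Illustration of the reuse case above (hypothetical types): a loop mask
   created for vectors of 8 short elements can be reused for vectors of
   4 int elements; each adjacent pair of mask elements is known to be
   all-zero or all-one, so a VIEW_CONVERT_EXPR to the 4-element mask type
   gives the mask that is needed.  */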
8531 
8532 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8533    according to the estimated number of iterations.  */
8534 
8535 static void
8536 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8537 {
8538   edge preheader = loop_preheader_edge (loop);
8539   /* Reduce loop iterations by the vectorization factor.  */
8540   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8541   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8542 
8543   if (freq_h.nonzero_p ())
8544     {
8545       profile_probability p;
8546 
8547       /* Avoid dropping loop body profile counter to 0 because of zero count
8548 	 in loop's preheader.  */
8549       if (!(freq_e == profile_count::zero ()))
8550         freq_e = freq_e.force_nonzero ();
8551       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8552       scale_loop_frequencies (loop, p);
8553     }
8554 
8555   edge exit_e = single_exit (loop);
8556   exit_e->probability = profile_probability::always ()
8557 				 .apply_scale (1, new_est_niter + 1);
8558 
8559   edge exit_l = single_pred_edge (loop->latch);
8560   profile_probability prob = exit_l->probability;
8561   exit_l->probability = exit_e->probability.invert ();
8562   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8563     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8564 }
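
/* Rough illustration (assumed numbers): for a loop with an estimated 100
   iterations vectorized by VF = 4, new_est_niter is about 25, so the exit
   probability is scaled to roughly 1/26 and the body counts are scaled so
   that the header executes about 26 times per entry from the preheader.  */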
8565 
8566 /* Function vect_transform_loop.
8567 
8568    The analysis phase has determined that the loop is vectorizable.
8569    Vectorize the loop - create vectorized stmts to replace the scalar
8570    stmts in the loop, and update the loop exit condition.
8571    Returns the scalar epilogue loop, if any.  */
8572 
8573 struct loop *
8574 vect_transform_loop (loop_vec_info loop_vinfo)
8575 {
8576   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8577   struct loop *epilogue = NULL;
8578   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8579   int nbbs = loop->num_nodes;
8580   int i;
8581   tree niters_vector = NULL_TREE;
8582   tree step_vector = NULL_TREE;
8583   tree niters_vector_mult_vf = NULL_TREE;
8584   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8585   unsigned int lowest_vf = constant_lower_bound (vf);
8586   bool grouped_store;
8587   bool slp_scheduled = false;
8588   gimple *stmt, *pattern_stmt;
8589   gimple_seq pattern_def_seq = NULL;
8590   gimple_stmt_iterator pattern_def_si = gsi_none ();
8591   bool transform_pattern_stmt = false;
8592   bool check_profitability = false;
8593   unsigned int th;
8594 
8595   if (dump_enabled_p ())
8596     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8597 
8598   /* Use the more conservative vectorization threshold.  If the number
8599      of iterations is constant, assume the cost check has been performed
8600      by our caller.  If the threshold makes all loops profitable that
8601      run at least the (estimated) vectorization-factor number of times,
8602      checking is pointless too.  */
8603   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8604   if (th >= vect_vf_for_cost (loop_vinfo)
8605       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8606     {
8607       if (dump_enabled_p ())
8608 	dump_printf_loc (MSG_NOTE, vect_location,
8609 			 "Profitability threshold is %d loop iterations.\n",
8610                          th);
8611       check_profitability = true;
8612     }
8613 
8614   /* Make sure there exists a single-predecessor exit bb.  Do this before
8615      versioning.   */
8616   edge e = single_exit (loop);
8617   if (! single_pred_p (e->dest))
8618     {
8619       split_loop_exit_edge (e);
8620       if (dump_enabled_p ())
8621 	dump_printf (MSG_NOTE, "split exit edge\n");
8622     }
8623 
8624   /* Version the loop first, if required, so the profitability check
8625      comes first.  */
8626 
8627   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8628     {
8629       poly_uint64 versioning_threshold
8630 	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8631       if (check_profitability
8632 	  && ordered_p (poly_uint64 (th), versioning_threshold))
8633 	{
8634 	  versioning_threshold = ordered_max (poly_uint64 (th),
8635 					      versioning_threshold);
8636 	  check_profitability = false;
8637 	}
8638       vect_loop_versioning (loop_vinfo, th, check_profitability,
8639 			    versioning_threshold);
8640       check_profitability = false;
8641     }
8642 
8643   /* Make sure there exists a single-predecessor exit bb also on the
8644      scalar loop copy.  Do this after versioning but before peeling,
8645      so the CFG structure is fine for both the scalar and the if-converted
8646      loop, and slpeel_duplicate_current_defs_from_edges sees matched
8647      loop-closed PHI nodes on the exit.  */
8648   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8649     {
8650       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8651       if (! single_pred_p (e->dest))
8652 	{
8653 	  split_loop_exit_edge (e);
8654 	  if (dump_enabled_p ())
8655 	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8656 	}
8657     }
8658 
8659   tree niters = vect_build_loop_niters (loop_vinfo);
8660   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8661   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8662   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8663   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8664 			      &step_vector, &niters_vector_mult_vf, th,
8665 			      check_profitability, niters_no_overflow);
8666 
8667   if (niters_vector == NULL_TREE)
8668     {
8669       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8670 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8671 	  && known_eq (lowest_vf, vf))
8672 	{
8673 	  niters_vector
8674 	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8675 			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8676 	  step_vector = build_one_cst (TREE_TYPE (niters));
8677 	}
8678       else
8679 	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8680 				     &step_vector, niters_no_overflow);
8681     }
8682 
8683   /* 1) Make sure the loop header has exactly two entries
8684      2) Make sure we have a preheader basic block.  */
8685 
8686   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8687 
8688   split_edge (loop_preheader_edge (loop));
8689 
8690   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8691       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8692     /* This will deal with any possible peeling.  */
8693     vect_prepare_for_masked_peels (loop_vinfo);
8694 
8695   /* FORNOW: the vectorizer supports only loops whose body consists
8696      of one basic block (header + empty latch).  When the vectorizer
8697      supports more involved loop forms, the order by which the BBs are
8698      traversed needs to be reconsidered.  */
8699 
8700   for (i = 0; i < nbbs; i++)
8701     {
8702       basic_block bb = bbs[i];
8703       stmt_vec_info stmt_info;
8704 
8705       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8706 	   gsi_next (&si))
8707         {
8708 	  gphi *phi = si.phi ();
8709 	  if (dump_enabled_p ())
8710 	    {
8711 	      dump_printf_loc (MSG_NOTE, vect_location,
8712                                "------>vectorizing phi: ");
8713 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8714 	    }
8715 	  stmt_info = vinfo_for_stmt (phi);
8716 	  if (!stmt_info)
8717 	    continue;
8718 
8719 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8720 	    vect_loop_kill_debug_uses (loop, phi);
8721 
8722 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8723 	      && !STMT_VINFO_LIVE_P (stmt_info))
8724 	    continue;
8725 
8726 	  if (STMT_VINFO_VECTYPE (stmt_info)
8727 	      && (maybe_ne
8728 		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8729 	      && dump_enabled_p ())
8730 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8731 
8732 	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8733 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8734 	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8735 	      && ! PURE_SLP_STMT (stmt_info))
8736 	    {
8737 	      if (dump_enabled_p ())
8738 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8739 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8740 	    }
8741 	}
8742 
8743       pattern_stmt = NULL;
8744       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8745 	   !gsi_end_p (si) || transform_pattern_stmt;)
8746 	{
8747 	  bool is_store;
8748 
8749           if (transform_pattern_stmt)
8750 	    stmt = pattern_stmt;
8751           else
8752 	    {
8753 	      stmt = gsi_stmt (si);
8754 	      /* During vectorization remove existing clobber stmts.  */
8755 	      if (gimple_clobber_p (stmt))
8756 		{
8757 		  unlink_stmt_vdef (stmt);
8758 		  gsi_remove (&si, true);
8759 		  release_defs (stmt);
8760 		  continue;
8761 		}
8762 	    }
8763 
8764 	  if (dump_enabled_p ())
8765 	    {
8766 	      dump_printf_loc (MSG_NOTE, vect_location,
8767 			       "------>vectorizing statement: ");
8768 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8769 	    }
8770 
8771 	  stmt_info = vinfo_for_stmt (stmt);
8772 
8773 	  /* vector stmts created in the outer-loop during vectorization of
8774 	     stmts in an inner-loop may not have a stmt_info, and do not
8775 	     need to be vectorized.  */
8776 	  if (!stmt_info)
8777 	    {
8778 	      gsi_next (&si);
8779 	      continue;
8780 	    }
8781 
8782 	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8783 	    vect_loop_kill_debug_uses (loop, stmt);
8784 
8785 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8786 	      && !STMT_VINFO_LIVE_P (stmt_info))
8787             {
8788               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8789                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8790                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8791                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8792                 {
8793                   stmt = pattern_stmt;
8794                   stmt_info = vinfo_for_stmt (stmt);
8795                 }
8796               else
8797 	        {
8798    	          gsi_next (&si);
8799 	          continue;
8800                 }
8801 	    }
8802           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8803                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8804                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8805                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8806             transform_pattern_stmt = true;
8807 
8808 	  /* If pattern statement has def stmts, vectorize them too.  */
8809 	  if (is_pattern_stmt_p (stmt_info))
8810 	    {
8811 	      if (pattern_def_seq == NULL)
8812 		{
8813 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8814 		  pattern_def_si = gsi_start (pattern_def_seq);
8815 		}
8816 	      else if (!gsi_end_p (pattern_def_si))
8817 		gsi_next (&pattern_def_si);
8818 	      if (pattern_def_seq != NULL)
8819 		{
8820 		  gimple *pattern_def_stmt = NULL;
8821 		  stmt_vec_info pattern_def_stmt_info = NULL;
8822 
8823 		  while (!gsi_end_p (pattern_def_si))
8824 		    {
8825 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
8826 		      pattern_def_stmt_info
8827 			= vinfo_for_stmt (pattern_def_stmt);
8828 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8829 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8830 			break;
8831 		      gsi_next (&pattern_def_si);
8832 		    }
8833 
8834 		  if (!gsi_end_p (pattern_def_si))
8835 		    {
8836 		      if (dump_enabled_p ())
8837 			{
8838 			  dump_printf_loc (MSG_NOTE, vect_location,
8839 					   "==> vectorizing pattern def "
8840 					   "stmt: ");
8841 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8842 					    pattern_def_stmt, 0);
8843 			}
8844 
8845 		      stmt = pattern_def_stmt;
8846 		      stmt_info = pattern_def_stmt_info;
8847 		    }
8848 		  else
8849 		    {
8850 		      pattern_def_si = gsi_none ();
8851 		      transform_pattern_stmt = false;
8852 		    }
8853 		}
8854 	      else
8855 		transform_pattern_stmt = false;
8856             }
8857 
8858 	  if (STMT_VINFO_VECTYPE (stmt_info))
8859 	    {
8860 	      poly_uint64 nunits
8861 		= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8862 	      if (!STMT_SLP_TYPE (stmt_info)
8863 		  && maybe_ne (nunits, vf)
8864 		  && dump_enabled_p ())
8865 		  /* For SLP, VF is set according to the unrolling factor rather
8866 		     than the vector size, so this diagnostic is not valid for SLP.  */
8867 		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8868 	    }
8869 
8870 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
8871 	     reached.  */
8872 	  if (STMT_SLP_TYPE (stmt_info))
8873 	    {
8874 	      if (!slp_scheduled)
8875 		{
8876 		  slp_scheduled = true;
8877 
8878 		  if (dump_enabled_p ())
8879 		    dump_printf_loc (MSG_NOTE, vect_location,
8880 				     "=== scheduling SLP instances ===\n");
8881 
8882 		  vect_schedule_slp (loop_vinfo);
8883 		}
8884 
8885 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8886 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8887 		{
8888 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8889 		    {
8890 		      pattern_def_seq = NULL;
8891 		      gsi_next (&si);
8892 		    }
8893 		  continue;
8894 		}
8895 	    }
8896 
8897 	  /* -------- vectorize statement ------------ */
8898 	  if (dump_enabled_p ())
8899 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8900 
8901 	  grouped_store = false;
8902 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8903           if (is_store)
8904             {
8905 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8906 		{
8907 		  /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
8908 		     interleaving chain was completed; free all the stores in
8909 		     the chain.  */
8910 		  gsi_next (&si);
8911 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8912 		}
8913 	      else
8914 		{
8915 		  /* Free the attached stmt_vec_info and remove the stmt.  */
8916 		  gimple *store = gsi_stmt (si);
8917 		  free_stmt_vec_info (store);
8918 		  unlink_stmt_vdef (store);
8919 		  gsi_remove (&si, true);
8920 		  release_defs (store);
8921 		}
8922 
8923 	      /* Stores can only appear at the end of pattern statements.  */
8924 	      gcc_assert (!transform_pattern_stmt);
8925 	      pattern_def_seq = NULL;
8926 	    }
8927 	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8928 	    {
8929 	      pattern_def_seq = NULL;
8930 	      gsi_next (&si);
8931 	    }
8932 	}		        /* stmts in BB */
8933 
8934       /* Stub out scalar statements that must not survive vectorization.
8935 	 Doing this here helps with grouped statements, or statements that
8936 	 are involved in patterns.  */
8937       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8938 	   !gsi_end_p (gsi); gsi_next (&gsi))
8939 	{
8940 	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8941 	  if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8942 	    {
8943 	      tree lhs = gimple_get_lhs (call);
8944 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8945 		{
8946 		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8947 		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8948 		  gsi_replace (&gsi, new_stmt, true);
8949 		}
8950 	    }
8951 	}
8952     }				/* BBs in loop */
8953 
8954 	  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8955 	     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8956   if (integer_onep (step_vector))
8957     niters_no_overflow = true;
8958   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8959 			   niters_vector_mult_vf, !niters_no_overflow);
8960 
8961   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8962   scale_profile_for_vect_loop (loop, assumed_vf);
8963 
8964   /* True if the final iteration might not handle a full vector's
8965      worth of scalar iterations.  */
8966   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8967   /* The minimum number of iterations performed by the epilogue.  This
8968      is 1 when peeling for gaps because we always need a final scalar
8969      iteration.  */
8970   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8971   /* +1 to convert latch counts to loop iteration counts,
8972      -min_epilogue_iters to remove iterations that cannot be performed
8973        by the vector code.  */
8974   int bias_for_lowest = 1 - min_epilogue_iters;
8975   int bias_for_assumed = bias_for_lowest;
8976   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8977   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8978     {
8979       /* When the amount of peeling is known at compile time, the first
8980 	 iteration will have exactly alignment_npeels active elements.
8981 	 In the worst case it will have at least one.  */
8982       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8983       bias_for_lowest += lowest_vf - min_first_active;
8984       bias_for_assumed += assumed_vf - min_first_active;
8985     }
8986   /* In these calculations the "- 1" converts loop iteration counts
8987      back to latch counts.  */
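  /* A purely illustrative example (the numbers are not taken from any
     particular target): with lowest_vf == 4, min_epilogue_iters == 0
     (so bias_for_lowest == 1) and a scalar latch bound of 10, i.e. at
     most 11 scalar iterations, a non-fully-masked loop runs at most
     floor ((10 + 1) / 4) == 2 vector iterations, giving a new latch
     bound of 2 - 1 == 1.  A fully-masked loop rounds the division up
     instead, since its final iteration may be partial.  */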
8988   if (loop->any_upper_bound)
8989     loop->nb_iterations_upper_bound
8990       = (final_iter_may_be_partial
8991 	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8992 			  lowest_vf) - 1
8993 	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8994 			   lowest_vf) - 1);
8995   if (loop->any_likely_upper_bound)
8996     loop->nb_iterations_likely_upper_bound
8997       = (final_iter_may_be_partial
8998 	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8999 			  + bias_for_lowest, lowest_vf) - 1
9000 	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9001 			   + bias_for_lowest, lowest_vf) - 1);
9002   if (loop->any_estimate)
9003     loop->nb_iterations_estimate
9004       = (final_iter_may_be_partial
9005 	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9006 			  assumed_vf) - 1
9007 	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9008 			   assumed_vf) - 1);
9009 
9010   if (dump_enabled_p ())
9011     {
9012       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9013 	{
9014 	  dump_printf_loc (MSG_NOTE, vect_location,
9015 			   "LOOP VECTORIZED\n");
9016 	  if (loop->inner)
9017 	    dump_printf_loc (MSG_NOTE, vect_location,
9018 			     "OUTER LOOP VECTORIZED\n");
9019 	  dump_printf (MSG_NOTE, "\n");
9020 	}
9021       else
9022 	{
9023 	  dump_printf_loc (MSG_NOTE, vect_location,
9024 			   "LOOP EPILOGUE VECTORIZED (VS=");
9025 	  dump_dec (MSG_NOTE, current_vector_size);
9026 	  dump_printf (MSG_NOTE, ")\n");
9027 	}
9028     }
9029 
9030   /* Free SLP instances here because otherwise stmt reference counting
9031      won't work.  */
9032   slp_instance instance;
9033   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9034     vect_free_slp_instance (instance);
9035   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9036   /* Clear the safelen field since its value is invalid after vectorization:
9037      the vectorized loop can have loop-carried dependencies.  */
9038   loop->safelen = 0;
9039 
9040   /* Don't vectorize epilogue for epilogue.  */
9041   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9042     epilogue = NULL;
9043 
9044   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9045     epilogue = NULL;
9046 
9047   if (epilogue)
9048     {
9049       auto_vector_sizes vector_sizes;
9050       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9051       unsigned int next_size = 0;
9052 
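      /* Choose the vector size for the epilogue loop: skip candidate sizes
	 that are larger than the current vector size and, when the
	 iteration count is known, also those for which the remaining
	 EITERS scalar iterations satisfy EITERS < lowest_vf / ratio.
	 As a purely illustrative example, with current_vector_size == 32
	 bytes, lowest_vf == 8 and eiters == 5, the 32-byte candidate
	 (ratio == 1) is rejected because 5 < 8, while a 16-byte candidate
	 (ratio == 2) is accepted because 5 >= 4.  */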
9053       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9054 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9055 	  && known_eq (vf, lowest_vf))
9056 	{
9057 	  unsigned int eiters
9058 	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9059 	       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9060 	  eiters = eiters % lowest_vf;
9061 	  epilogue->nb_iterations_upper_bound = eiters - 1;
9062 
9063 	  unsigned int ratio;
9064 	  while (next_size < vector_sizes.length ()
9065 		 && !(constant_multiple_p (current_vector_size,
9066 					   vector_sizes[next_size], &ratio)
9067 		      && eiters >= lowest_vf / ratio))
9068 	    next_size += 1;
9069 	}
9070       else
9071 	while (next_size < vector_sizes.length ()
9072 	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
9073 	  next_size += 1;
9074 
9075       if (next_size == vector_sizes.length ())
9076 	epilogue = NULL;
9077     }
9078 
9079   if (epilogue)
9080     {
9081       epilogue->force_vectorize = loop->force_vectorize;
9082       epilogue->safelen = loop->safelen;
9083       epilogue->dont_vectorize = false;
9084 
9085       /* We may need to if-convert the epilogue to vectorize it.  */
9086       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9087 	tree_if_conversion (epilogue);
9088     }
9089 
9090   return epilogue;
9091 }
9092 
9093 /* The code below performs a simple optimization: it reverts if-conversion
9094    for masked stores, i.e. if the mask of a store is all-zero the store is
9095    not performed and, where possible, neither are the producers of the
9096    stored values.  For example,
9097      for (i=0; i<n; i++)
9098        if (c[i])
9099 	{
9100 	  p1[i] += 1;
9101 	  p2[i] = p3[i] + 2;
9102 	}
9103    this transformation will produce the following semi-hammock:
9104 
9105    if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9106      {
9107        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9108        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9109        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9110        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9111        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9112        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9113      }
9114 */
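
/* After this transformation each group of masked stores that share a mask
   is guarded by a test of that mask against zero.  A rough sketch of the
   resulting CFG (the block names follow the local variables below and the
   layout is illustrative only):

       bb:        ...  if (mask == { 0, ..., 0 }) goto join_bb; else goto store_bb;
       store_bb:  the MASK_STOREs and, where possible, their value producers
       join_bb:   PHI merging the virtual operand from bb and store_bb
*/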
9115 
9116 void
9117 optimize_mask_stores (struct loop *loop)
9118 {
9119   basic_block *bbs = get_loop_body (loop);
9120   unsigned nbbs = loop->num_nodes;
9121   unsigned i;
9122   basic_block bb;
9123   struct loop *bb_loop;
9124   gimple_stmt_iterator gsi;
9125   gimple *stmt;
9126   auto_vec<gimple *> worklist;
9127 
9128   vect_location = find_loop_location (loop);
9129   /* Pick up all masked stores in loop if any.  */
9130   for (i = 0; i < nbbs; i++)
9131     {
9132       bb = bbs[i];
9133       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9134 	   gsi_next (&gsi))
9135 	{
9136 	  stmt = gsi_stmt (gsi);
9137 	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9138 	    worklist.safe_push (stmt);
9139 	}
9140     }
9141 
9142   free (bbs);
9143   if (worklist.is_empty ())
9144     return;
9145 
9146   /* Loop has masked stores.  */
9147   while (!worklist.is_empty ())
9148     {
9149       gimple *last, *last_store;
9150       edge e, efalse;
9151       tree mask;
9152       basic_block store_bb, join_bb;
9153       gimple_stmt_iterator gsi_to;
9154       tree vdef, new_vdef;
9155       gphi *phi;
9156       tree vectype;
9157       tree zero;
9158 
9159       last = worklist.pop ();
9160       mask = gimple_call_arg (last, 2);
9161       bb = gimple_bb (last);
9162       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9163 	 the same loop as if_bb.  That loop can differ from LOOP when a
9164 	 two-level loop nest is vectorized and the mask_store belongs to the
9165 	 inner loop.  */
9166       e = split_block (bb, last);
9167       bb_loop = bb->loop_father;
9168       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9169       join_bb = e->dest;
9170       store_bb = create_empty_bb (bb);
9171       add_bb_to_loop (store_bb, bb_loop);
9172       e->flags = EDGE_TRUE_VALUE;
9173       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9174       /* Put STORE_BB to likely part.  */
9175       efalse->probability = profile_probability::unlikely ();
9176       store_bb->count = efalse->count ();
9177       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9178       if (dom_info_available_p (CDI_DOMINATORS))
9179 	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9180       if (dump_enabled_p ())
9181 	dump_printf_loc (MSG_NOTE, vect_location,
9182 			 "Create new block %d to sink mask stores.",
9183 			 store_bb->index);
9184       /* Create vector comparison with boolean result.  */
9185       vectype = TREE_TYPE (mask);
9186       zero = build_zero_cst (vectype);
9187       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9188       gsi = gsi_last_bb (bb);
9189       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9190       /* Create new PHI node for vdef of the last masked store:
9191 	 .MEM_2 = VDEF <.MEM_1>
9192 	 will be converted to
9193 	 .MEM.3 = VDEF <.MEM_1>
9194 	 and new PHI node will be created in join bb
9195 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9196       */
9197       vdef = gimple_vdef (last);
9198       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9199       gimple_set_vdef (last, new_vdef);
9200       phi = create_phi_node (vdef, join_bb);
9201       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9202 
9203       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9204       while (true)
9205 	{
9206 	  gimple_stmt_iterator gsi_from;
9207 	  gimple *stmt1 = NULL;
9208 
9209 	  /* Move masked store to STORE_BB.  */
9210 	  last_store = last;
9211 	  gsi = gsi_for_stmt (last);
9212 	  gsi_from = gsi;
9213 	  /* Shift GSI to the previous stmt for further traversal.  */
9214 	  gsi_prev (&gsi);
9215 	  gsi_to = gsi_start_bb (store_bb);
9216 	  gsi_move_before (&gsi_from, &gsi_to);
9217 	  /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9218 	  gsi_to = gsi_start_bb (store_bb);
9219 	  if (dump_enabled_p ())
9220 	    {
9221 	      dump_printf_loc (MSG_NOTE, vect_location,
9222 			       "Move stmt to created bb\n");
9223 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9224 	    }
9225 	  /* Move all stored value producers if possible.  */
9226 	  while (!gsi_end_p (gsi))
9227 	    {
9228 	      tree lhs;
9229 	      imm_use_iterator imm_iter;
9230 	      use_operand_p use_p;
9231 	      bool res;
9232 
9233 	      /* Skip debug statements.  */
9234 	      if (is_gimple_debug (gsi_stmt (gsi)))
9235 		{
9236 		  gsi_prev (&gsi);
9237 		  continue;
9238 		}
9239 	      stmt1 = gsi_stmt (gsi);
9240 	      /* Do not consider statements writing to memory or having
9241 		 volatile operand.  */
9242 	      if (gimple_vdef (stmt1)
9243 		  || gimple_has_volatile_ops (stmt1))
9244 		break;
9245 	      gsi_from = gsi;
9246 	      gsi_prev (&gsi);
9247 	      lhs = gimple_get_lhs (stmt1);
9248 	      if (!lhs)
9249 		break;
9250 
9251 	      /* LHS of vectorized stmt must be SSA_NAME.  */
9252 	      if (TREE_CODE (lhs) != SSA_NAME)
9253 		break;
9254 
9255 	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9256 		{
9257 		  /* Remove dead scalar statement.  */
9258 		  if (has_zero_uses (lhs))
9259 		    {
9260 		      gsi_remove (&gsi_from, true);
9261 		      continue;
9262 		    }
9263 		}
9264 
9265 	      /* Check that LHS does not have uses outside of STORE_BB.  */
9266 	      res = true;
9267 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9268 		{
9269 		  gimple *use_stmt;
9270 		  use_stmt = USE_STMT (use_p);
9271 		  if (is_gimple_debug (use_stmt))
9272 		    continue;
9273 		  if (gimple_bb (use_stmt) != store_bb)
9274 		    {
9275 		      res = false;
9276 		      break;
9277 		    }
9278 		}
9279 	      if (!res)
9280 		break;
9281 
9282 	      if (gimple_vuse (stmt1)
9283 		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
9284 		break;
9285 
9286 	      /* Can move STMT1 to STORE_BB.  */
9287 	      if (dump_enabled_p ())
9288 		{
9289 		  dump_printf_loc (MSG_NOTE, vect_location,
9290 				   "Move stmt to created bb\n");
9291 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9292 		}
9293 	      gsi_move_before (&gsi_from, &gsi_to);
9294 	      /* Shift GSI_TO for further insertion.  */
9295 	      gsi_prev (&gsi_to);
9296 	    }
9297 	  /* Put other masked stores with the same mask to STORE_BB.  */
9298 	  if (worklist.is_empty ()
9299 	      || gimple_call_arg (worklist.last (), 2) != mask
9300 	      || worklist.last () != stmt1)
9301 	    break;
9302 	  last = worklist.pop ();
9303 	}
9304       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9305     }
9306 }
9307