xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-loop.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /* Loop Vectorization
2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
89 
90 /* Loop Vectorization Pass.
91 
92    This pass tries to vectorize loops.
93 
94    For example, the vectorizer transforms the following simple loop:
95 
96         short a[N]; short b[N]; short c[N]; int i;
97 
98         for (i=0; i<N; i++){
99           a[i] = b[i] + c[i];
100         }
101 
102    as if it were manually vectorized by rewriting the source code into:
103 
104         typedef int __attribute__((mode(V8HI))) v8hi;
105         short a[N];  short b[N]; short c[N];   int i;
106         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107         v8hi va, vb, vc;
108 
109         for (i=0; i<N/8; i++){
110           vb = pb[i];
111           vc = pc[i];
112           va = vb + vc;
113           pa[i] = va;
114         }
115 
116         The main entry to this pass is vectorize_loops(), in which
117    the vectorizer applies a set of analyses on a given set of loops,
118    followed by the actual vectorization transformation for the loops that
119    have successfully passed the analysis phase.
120         Throughout this pass we make a distinction between two types of
121    data: scalars (which are represented by SSA_NAMEs), and memory references
122    ("data-refs").  These two types of data require different handling both
123    during analysis and transformation. The types of data-refs that the
124    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126    accesses are required to have a simple (consecutive) access pattern.
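
        For illustration only (hypothetical arrays 'a', 'b' and pointer 'p',
   not taken from this file), accesses such as

        a[i]        *(p + i)

   are consecutive from one iteration to the next, whereas accesses such as

        a[2*i]      a[b[i]]

   are strided or indirect and do not form a simple consecutive pattern.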
127 
128    Analysis phase:
129    ===============
130         The driver for the analysis phase is vect_analyze_loop().
131    It applies a set of analyses, some of which rely on the scalar evolution
132    analyzer (scev) developed by Sebastian Pop.
133 
134         During the analysis phase the vectorizer records some information
135    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136    loop, as well as general information about the loop as a whole, which is
137    recorded in a "loop_vec_info" struct attached to each loop.
138 
139    Transformation phase:
140    =====================
141         The loop transformation phase scans all the stmts in the loop, and
142    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143    the loop that needs to be vectorized.  It inserts the vector code sequence
144    just before the scalar stmt S, and records a pointer to the vector code
145    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146    attached to S).  This pointer will be used for the vectorization of following
147    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148    otherwise, we rely on dead code elimination for removing it.
149 
150         For example, say stmt S1 was vectorized into stmt VS1:
151 
152    VS1: vb = px[i];
153    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154    S2:  a = b;
155 
156    To vectorize stmt S2, the vectorizer first finds the stmt that defines
157    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
159    resulting sequence would be:
160 
161    VS1: vb = px[i];
162    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163    VS2: va = vb;
164    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
165 
166         Operands that are not SSA_NAMEs are data-refs that appear in
167    load/store operations (like 'x[i]' in S1), and are handled differently.
168 
169    Target modeling:
170    =================
171         Currently the only target-specific information that is used is the
172    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173    Targets that can support different vector sizes will, for now, need
174    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
175    flexibility will be added in the future.
176 
177         Since we only vectorize operations whose vector form can be
178    expressed using existing tree codes, to verify that an operation is
179    supported, the vectorizer checks the relevant optab at the relevant
180    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
181    the value found is CODE_FOR_nothing, then there's no target support, and
182    we can't vectorize the stmt.
183 
184    For additional information on this project see:
185    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
186 */
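
/* For illustration only - a sketch, not code from this file.  The target
   support check described under "Target modeling" above is essentially an
   optab query; optab_handler, add_optab, V8HImode and CODE_FOR_nothing are
   the real GCC names mentioned in that comment, while the surrounding
   control flow is hypothetical:

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       return false;

   A result of CODE_FOR_nothing means the target has no support and the
   stmt cannot be vectorized.  */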
187 
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
189 
190 /* Function vect_determine_vectorization_factor
191 
192    Determine the vectorization factor (VF).  VF is the number of data elements
193    that are operated upon in parallel in a single iteration of the vectorized
194    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
195    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
196    elements can fit in a single vector register.
197 
198    We currently support vectorization of loops in which all types operated upon
199    are of the same size.  Therefore this function currently sets VF according to
200    the size of the types operated upon, and fails if there are multiple sizes
201    in the loop.
202 
203    VF is also the factor by which the loop iterations are strip-mined, e.g.:
204    original loop:
205         for (i=0; i<N; i++){
206           a[i] = b[i] + c[i];
207         }
208 
209    vectorized loop:
210         for (i=0; i<N; i+=VF){
211           a[i:VF] = b[i:VF] + c[i:VF];
212         }
213 */
214 
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
217 {
218   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220   int nbbs = loop->num_nodes;
221   unsigned int vectorization_factor = 0;
222   tree scalar_type;
223   gphi *phi;
224   tree vectype;
225   unsigned int nunits;
226   stmt_vec_info stmt_info;
227   int i;
228   HOST_WIDE_INT dummy;
229   gimple stmt, pattern_stmt = NULL;
230   gimple_seq pattern_def_seq = NULL;
231   gimple_stmt_iterator pattern_def_si = gsi_none ();
232   bool analyze_pattern_stmt = false;
233 
234   if (dump_enabled_p ())
235     dump_printf_loc (MSG_NOTE, vect_location,
236                      "=== vect_determine_vectorization_factor ===\n");
237 
238   for (i = 0; i < nbbs; i++)
239     {
240       basic_block bb = bbs[i];
241 
242       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 	   gsi_next (&si))
244 	{
245 	  phi = si.phi ();
246 	  stmt_info = vinfo_for_stmt (phi);
247 	  if (dump_enabled_p ())
248 	    {
249 	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 	      dump_printf (MSG_NOTE, "\n");
252 	    }
253 
254 	  gcc_assert (stmt_info);
255 
256 	  if (STMT_VINFO_RELEVANT_P (stmt_info))
257             {
258 	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259               scalar_type = TREE_TYPE (PHI_RESULT (phi));
260 
261 	      if (dump_enabled_p ())
262 		{
263 		  dump_printf_loc (MSG_NOTE, vect_location,
264                                    "get vectype for scalar type:  ");
265 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266                   dump_printf (MSG_NOTE, "\n");
267 		}
268 
269 	      vectype = get_vectype_for_scalar_type (scalar_type);
270 	      if (!vectype)
271 		{
272 		  if (dump_enabled_p ())
273 		    {
274 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275                                        "not vectorized: unsupported "
276                                        "data-type ");
277 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278                                          scalar_type);
279                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
280 		    }
281 		  return false;
282 		}
283 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
284 
285 	      if (dump_enabled_p ())
286 		{
287 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289                   dump_printf (MSG_NOTE, "\n");
290 		}
291 
292 	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 	      if (dump_enabled_p ())
294 		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295                                  nunits);
296 
297 	      if (!vectorization_factor
298 		  || (nunits > vectorization_factor))
299 		vectorization_factor = nunits;
300 	    }
301 	}
302 
303       for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 	   !gsi_end_p (si) || analyze_pattern_stmt;)
305         {
306           tree vf_vectype;
307 
308           if (analyze_pattern_stmt)
309 	    stmt = pattern_stmt;
310           else
311             stmt = gsi_stmt (si);
312 
313           stmt_info = vinfo_for_stmt (stmt);
314 
315 	  if (dump_enabled_p ())
316 	    {
317 	      dump_printf_loc (MSG_NOTE, vect_location,
318                                "==> examining statement: ");
319 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320               dump_printf (MSG_NOTE, "\n");
321 	    }
322 
323 	  gcc_assert (stmt_info);
324 
325 	  /* Skip stmts which do not need to be vectorized.  */
326 	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 	       && !STMT_VINFO_LIVE_P (stmt_info))
328 	      || gimple_clobber_p (stmt))
329             {
330               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
334                 {
335                   stmt = pattern_stmt;
336                   stmt_info = vinfo_for_stmt (pattern_stmt);
337                   if (dump_enabled_p ())
338                     {
339                       dump_printf_loc (MSG_NOTE, vect_location,
340                                        "==> examining pattern statement: ");
341                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342                       dump_printf (MSG_NOTE, "\n");
343                     }
344                 }
345               else
346 	        {
347 	          if (dump_enabled_p ())
348 	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349                   gsi_next (&si);
350 	          continue;
351                 }
352 	    }
353           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357             analyze_pattern_stmt = true;
358 
359 	  /* If a pattern statement has def stmts, analyze them too.  */
360 	  if (is_pattern_stmt_p (stmt_info))
361 	    {
362 	      if (pattern_def_seq == NULL)
363 		{
364 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 		  pattern_def_si = gsi_start (pattern_def_seq);
366 		}
367 	      else if (!gsi_end_p (pattern_def_si))
368 		gsi_next (&pattern_def_si);
369 	      if (pattern_def_seq != NULL)
370 		{
371 		  gimple pattern_def_stmt = NULL;
372 		  stmt_vec_info pattern_def_stmt_info = NULL;
373 
374 		  while (!gsi_end_p (pattern_def_si))
375 		    {
376 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
377 		      pattern_def_stmt_info
378 			= vinfo_for_stmt (pattern_def_stmt);
379 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 			break;
382 		      gsi_next (&pattern_def_si);
383 		    }
384 
385 		  if (!gsi_end_p (pattern_def_si))
386 		    {
387 		      if (dump_enabled_p ())
388 			{
389 			  dump_printf_loc (MSG_NOTE, vect_location,
390                                            "==> examining pattern def stmt: ");
391 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392                                             pattern_def_stmt, 0);
393                           dump_printf (MSG_NOTE, "\n");
394 			}
395 
396 		      stmt = pattern_def_stmt;
397 		      stmt_info = pattern_def_stmt_info;
398 		    }
399 		  else
400 		    {
401 		      pattern_def_si = gsi_none ();
402 		      analyze_pattern_stmt = false;
403 		    }
404 		}
405 	      else
406 		analyze_pattern_stmt = false;
407 	    }
408 
409 	  if (gimple_get_lhs (stmt) == NULL_TREE
410 	      /* MASK_STORE has no lhs, but is ok.  */
411 	      && (!is_gimple_call (stmt)
412 		  || !gimple_call_internal_p (stmt)
413 		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
414 	    {
415 	      if (is_gimple_call (stmt))
416 		{
417 		  /* Ignore calls with no lhs.  These must be calls to
418 		     #pragma omp simd functions, and what vectorization factor
419 		     it really needs can't be determined until
420 		     vectorizable_simd_clone_call.  */
421 		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
422 		    {
423 		      pattern_def_seq = NULL;
424 		      gsi_next (&si);
425 		    }
426 		  continue;
427 		}
428 	      if (dump_enabled_p ())
429 		{
430 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431                                    "not vectorized: irregular stmt.");
432 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
433                                     0);
434                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
435 		}
436 	      return false;
437 	    }
438 
439 	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
440 	    {
441 	      if (dump_enabled_p ())
442 	        {
443 	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444                                    "not vectorized: vector stmt in loop:");
445 	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
447 	        }
448 	      return false;
449 	    }
450 
451 	  if (STMT_VINFO_VECTYPE (stmt_info))
452 	    {
453 	      /* The only case when a vectype had been already set is for stmts
454 	         that contain a dataref, or for "pattern-stmts" (stmts
455 		 generated by the vectorizer to represent/replace a certain
456 		 idiom).  */
457 	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 			  || is_pattern_stmt_p (stmt_info)
459 			  || !gsi_end_p (pattern_def_si));
460 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
461 	    }
462 	  else
463 	    {
464 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 	      if (is_gimple_call (stmt)
466 		  && gimple_call_internal_p (stmt)
467 		  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 	      else
470 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 	      if (dump_enabled_p ())
472 		{
473 		  dump_printf_loc (MSG_NOTE, vect_location,
474                                    "get vectype for scalar type:  ");
475 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476                   dump_printf (MSG_NOTE, "\n");
477 		}
478 	      vectype = get_vectype_for_scalar_type (scalar_type);
479 	      if (!vectype)
480 		{
481 		  if (dump_enabled_p ())
482 		    {
483 		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484                                        "not vectorized: unsupported "
485                                        "data-type ");
486 		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487                                          scalar_type);
488                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
489 		    }
490 		  return false;
491 		}
492 
493 	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
494 
495 	      if (dump_enabled_p ())
496 		{
497 		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499                   dump_printf (MSG_NOTE, "\n");
500 		}
501             }
502 
503 	  /* The vectorization factor is determined by the smallest
504 	     scalar type (or the largest vector size, but we only
505 	     support one vector size per loop).  */
506 	  scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 						       &dummy);
508 	  if (dump_enabled_p ())
509 	    {
510 	      dump_printf_loc (MSG_NOTE, vect_location,
511                                "get vectype for scalar type:  ");
512 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513               dump_printf (MSG_NOTE, "\n");
514 	    }
515 	  vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 	  if (!vf_vectype)
517 	    {
518 	      if (dump_enabled_p ())
519 		{
520 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521                                    "not vectorized: unsupported data-type ");
522 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523                                      scalar_type);
524                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 		}
526 	      return false;
527 	    }
528 
529 	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 	    {
532 	      if (dump_enabled_p ())
533 		{
534 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535                                    "not vectorized: different sized vector "
536                                    "types in statement, ");
537 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538                                      vectype);
539 		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541                                      vf_vectype);
542                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 		}
544 	      return false;
545 	    }
546 
547 	  if (dump_enabled_p ())
548 	    {
549 	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551               dump_printf (MSG_NOTE, "\n");
552 	    }
553 
554 	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 	  if (dump_enabled_p ())
556 	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 	  if (!vectorization_factor
558 	      || (nunits > vectorization_factor))
559 	    vectorization_factor = nunits;
560 
561 	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 	    {
563 	      pattern_def_seq = NULL;
564 	      gsi_next (&si);
565 	    }
566         }
567     }
568 
569   /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
570   if (dump_enabled_p ())
571     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572                      vectorization_factor);
573   if (vectorization_factor <= 1)
574     {
575       if (dump_enabled_p ())
576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577                          "not vectorized: unsupported data-type\n");
578       return false;
579     }
580   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581 
582   return true;
583 }
584 
585 
586 /* Function vect_is_simple_iv_evolution.
587 
588    FORNOW: A simple evolution of an induction variable in the loop is
589    considered a polynomial evolution.  */
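
/* For illustration only (hypothetical loop, not taken from this file): in a
   loop such as

     for (i = 0; i < n; i++)
       a[i] = i;

   the scalar evolution of 'i' is the chrec {0, +, 1} - initial value 0 and
   step 1 - which is "simple" in the sense above.  An evolution whose step is
   itself a chrec (i.e. a polynomial of degree >= 2) is rejected below.  */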
590 
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593                              tree * step)
594 {
595   tree init_expr;
596   tree step_expr;
597   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598   basic_block bb;
599 
600   /* When there is no evolution in this loop, the evolution function
601      is not "simple".  */
602   if (evolution_part == NULL_TREE)
603     return false;
604 
605   /* When the evolution is a polynomial of degree >= 2
606      the evolution function is not "simple".  */
607   if (tree_is_chrec (evolution_part))
608     return false;
609 
610   step_expr = evolution_part;
611   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
612 
613   if (dump_enabled_p ())
614     {
615       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617       dump_printf (MSG_NOTE, ",  init: ");
618       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619       dump_printf (MSG_NOTE, "\n");
620     }
621 
622   *init = init_expr;
623   *step = step_expr;
624 
625   if (TREE_CODE (step_expr) != INTEGER_CST
626       && (TREE_CODE (step_expr) != SSA_NAME
627 	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 		  || !flag_associative_math)))
632       && (TREE_CODE (step_expr) != REAL_CST
633 	  || !flag_associative_math))
634     {
635       if (dump_enabled_p ())
636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637                          "step unknown.\n");
638       return false;
639     }
640 
641   return true;
642 }
643 
644 /* Function vect_analyze_scalar_cycles_1.
645 
646    Examine the cross iteration def-use cycles of scalar variables
647    in LOOP.  LOOP_VINFO represents the loop that is now being
648    considered for vectorization (can be LOOP, or an outer-loop
649    enclosing LOOP).  */
650 
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
653 {
654   basic_block bb = loop->header;
655   tree init, step;
656   auto_vec<gimple, 64> worklist;
657   gphi_iterator gsi;
658   bool double_reduc;
659 
660   if (dump_enabled_p ())
661     dump_printf_loc (MSG_NOTE, vect_location,
662                      "=== vect_analyze_scalar_cycles ===\n");
663 
664   /* First - identify all inductions.  Reduction detection assumes that all the
665      inductions have been identified; therefore, this order must not be
666      changed.  */
667   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
668     {
669       gphi *phi = gsi.phi ();
670       tree access_fn = NULL;
671       tree def = PHI_RESULT (phi);
672       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
673 
674       if (dump_enabled_p ())
675 	{
676 	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678           dump_printf (MSG_NOTE, "\n");
679 	}
680 
681       /* Skip virtual PHIs.  The data dependences that are associated with
682          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
683       if (virtual_operand_p (def))
684 	continue;
685 
686       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
687 
688       /* Analyze the evolution function.  */
689       access_fn = analyze_scalar_evolution (loop, def);
690       if (access_fn)
691 	{
692 	  STRIP_NOPS (access_fn);
693 	  if (dump_enabled_p ())
694 	    {
695 	      dump_printf_loc (MSG_NOTE, vect_location,
696                                "Access function of PHI: ");
697 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698               dump_printf (MSG_NOTE, "\n");
699 	    }
700 	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 	    = evolution_part_in_loop_num (access_fn, loop->num);
702 	}
703 
704       if (!access_fn
705 	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 	      && TREE_CODE (step) != INTEGER_CST))
708 	{
709 	  worklist.safe_push (phi);
710 	  continue;
711 	}
712 
713       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
714 
715       if (dump_enabled_p ())
716 	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
718     }
719 
720 
721   /* Second - identify all reductions and nested cycles.  */
722   while (worklist.length () > 0)
723     {
724       gimple phi = worklist.pop ();
725       tree def = PHI_RESULT (phi);
726       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727       gimple reduc_stmt;
728       bool nested_cycle;
729 
730       if (dump_enabled_p ())
731         {
732           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734           dump_printf (MSG_NOTE, "\n");
735         }
736 
737       gcc_assert (!virtual_operand_p (def)
738 		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
739 
740       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 						&double_reduc);
743       if (reduc_stmt)
744         {
745           if (double_reduc)
746             {
747               if (dump_enabled_p ())
748                 dump_printf_loc (MSG_NOTE, vect_location,
749 				 "Detected double reduction.\n");
750 
751               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753                                                     vect_double_reduction_def;
754             }
755           else
756             {
757               if (nested_cycle)
758                 {
759                   if (dump_enabled_p ())
760                     dump_printf_loc (MSG_NOTE, vect_location,
761 				     "Detected vectorizable nested cycle.\n");
762 
763                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765                                                              vect_nested_cycle;
766                 }
767               else
768                 {
769                   if (dump_enabled_p ())
770                     dump_printf_loc (MSG_NOTE, vect_location,
771 				     "Detected reduction.\n");
772 
773                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775                                                            vect_reduction_def;
776                   /* Store the reduction cycles for possible vectorization in
777                      loop-aware SLP.  */
778                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
779                 }
780             }
781         }
782       else
783         if (dump_enabled_p ())
784           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 			   "Unknown def-use cycle pattern.\n");
786     }
787 }
788 
789 
790 /* Function vect_analyze_scalar_cycles.
791 
792    Examine the cross iteration def-use cycles of scalar variables, by
793    analyzing the loop-header PHIs of scalar variables.  Classify each
794    cycle as one of the following: invariant, induction, reduction, unknown.
795    We do that for the loop represented by LOOP_VINFO, and also for its
796    inner-loop, if it exists.
797    Examples for scalar cycles:
798 
799    Example1: reduction:
800 
801               loop1:
802               for (i=0; i<N; i++)
803                  sum += a[i];
804 
805    Example2: induction:
806 
807               loop2:
808               for (i=0; i<N; i++)
809                  a[i] = i;  */
810 
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
813 {
814   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
815 
816   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
817 
818   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819      Reductions in such an inner-loop therefore have different properties from
820      the reductions in the nest that gets vectorized:
821      1. When vectorized, they are executed in the same order as in the original
822         scalar loop, so we can't change the order of computation when
823         vectorizing them.
824      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825         current checks are too strict.  */
826 
827   if (loop->inner)
828     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
829 }
830 
831 
832 /* Function vect_get_loop_niters.
833 
834    Determine the number of iterations the loop executes and place it
835    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
836    in NUMBER_OF_ITERATIONSM1.
837 
838    Return the loop exit condition.  */
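
/* For illustration only (hypothetical loop, not taken from this file): a
   countable loop whose body runs 4 times executes its latch 3 times, so the
   function below would set *NUMBER_OF_ITERATIONSM1 to 3 and
   *NUMBER_OF_ITERATIONS to 3 + 1 = 4 (the number of header executions).  */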
839 
840 
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 		      tree *number_of_iterationsm1)
844 {
845   tree niters;
846 
847   if (dump_enabled_p ())
848     dump_printf_loc (MSG_NOTE, vect_location,
849 		     "=== get_loop_niters ===\n");
850 
851   niters = number_of_latch_executions (loop);
852   *number_of_iterationsm1 = niters;
853 
854   /* We want the number of loop header executions which is the number
855      of latch executions plus one.
856      ???  For UINT_MAX latch executions this number overflows to zero
857      for loops like do { n++; } while (n != 0);  */
858   if (niters && !chrec_contains_undetermined (niters))
859     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 			  build_int_cst (TREE_TYPE (niters), 1));
861   *number_of_iterations = niters;
862 
863   return get_loop_exit_condition (loop);
864 }
865 
866 
867 /* Function bb_in_loop_p
868 
869    Used as a predicate for the dfs-order traversal of the loop bbs.  */
870 
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
873 {
874   const struct loop *const loop = (const struct loop *)data;
875   if (flow_bb_inside_loop_p (loop, bb))
876     return true;
877   return false;
878 }
879 
880 
881 /* Function new_loop_vec_info.
882 
883    Create and initialize a new loop_vec_info struct for LOOP, as well as
884    stmt_vec_info structs for all the stmts in LOOP.  */
885 
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
888 {
889   loop_vec_info res;
890   basic_block *bbs;
891   gimple_stmt_iterator si;
892   unsigned int i, nbbs;
893 
894   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895   LOOP_VINFO_LOOP (res) = loop;
896 
897   bbs = get_loop_body (loop);
898 
899   /* Create/Update stmt_info for all stmts in the loop.  */
900   for (i = 0; i < loop->num_nodes; i++)
901     {
902       basic_block bb = bbs[i];
903 
904       /* BBs in a nested inner-loop will already have been processed (because
905          we will have called vect_analyze_loop_form for any nested inner-loop).
906          Therefore, for stmts in an inner-loop we just want to update the
907          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908          loop_info of the outer-loop we are currently considering to vectorize
909          (instead of the loop_info of the inner-loop).
910          For stmts in other BBs we need to create a stmt_info from scratch.  */
911       if (bb->loop_father != loop)
912         {
913           /* Inner-loop bb.  */
914           gcc_assert (loop->inner && bb->loop_father == loop->inner);
915           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
916             {
917               gimple phi = gsi_stmt (si);
918               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919               loop_vec_info inner_loop_vinfo =
920                 STMT_VINFO_LOOP_VINFO (stmt_info);
921               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
923             }
924           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
925            {
926               gimple stmt = gsi_stmt (si);
927               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928               loop_vec_info inner_loop_vinfo =
929                  STMT_VINFO_LOOP_VINFO (stmt_info);
930               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
932            }
933         }
934       else
935         {
936           /* bb in current nest.  */
937           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
938             {
939               gimple phi = gsi_stmt (si);
940               gimple_set_uid (phi, 0);
941               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
942             }
943 
944           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
945             {
946               gimple stmt = gsi_stmt (si);
947               gimple_set_uid (stmt, 0);
948               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
949             }
950         }
951     }
952 
953   /* CHECKME: We want to visit all BBs before their successors (except for
954      latch blocks, for which this assertion wouldn't hold).  In the simple
955      case of the loop forms we allow, a dfs order of the BBs would be the same
956      as reversed postorder traversal, so we are safe.  */
957 
958    free (bbs);
959    bbs = XCNEWVEC (basic_block, loop->num_nodes);
960    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961                               bbs, loop->num_nodes, loop);
962    gcc_assert (nbbs == loop->num_nodes);
963 
964   LOOP_VINFO_BBS (res) = bbs;
965   LOOP_VINFO_NITERSM1 (res) = NULL;
966   LOOP_VINFO_NITERS (res) = NULL;
967   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972   LOOP_VINFO_VECT_FACTOR (res) = 0;
973   LOOP_VINFO_LOOP_NEST (res).create (3);
974   LOOP_VINFO_DATAREFS (res).create (10);
975   LOOP_VINFO_DDRS (res).create (10 * 10);
976   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981   LOOP_VINFO_GROUPED_STORES (res).create (10);
982   LOOP_VINFO_REDUCTIONS (res).create (10);
983   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984   LOOP_VINFO_SLP_INSTANCES (res).create (10);
985   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
990 
991   return res;
992 }
993 
994 
995 /* Function destroy_loop_vec_info.
996 
997    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998    stmts in the loop.  */
999 
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1002 {
1003   struct loop *loop;
1004   basic_block *bbs;
1005   int nbbs;
1006   gimple_stmt_iterator si;
1007   int j;
1008   vec<slp_instance> slp_instances;
1009   slp_instance instance;
1010   bool swapped;
1011 
1012   if (!loop_vinfo)
1013     return;
1014 
1015   loop = LOOP_VINFO_LOOP (loop_vinfo);
1016 
1017   bbs = LOOP_VINFO_BBS (loop_vinfo);
1018   nbbs = clean_stmts ? loop->num_nodes : 0;
1019   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1020 
1021   for (j = 0; j < nbbs; j++)
1022     {
1023       basic_block bb = bbs[j];
1024       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025         free_stmt_vec_info (gsi_stmt (si));
1026 
1027       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1028         {
1029           gimple stmt = gsi_stmt (si);
1030 
1031 	  /* We may have broken canonical form by moving a constant
1032 	     into RHS1 of a commutative op.  Fix such occurrences.  */
1033 	  if (swapped && is_gimple_assign (stmt))
1034 	    {
1035 	      enum tree_code code = gimple_assign_rhs_code (stmt);
1036 
1037 	      if ((code == PLUS_EXPR
1038 		   || code == POINTER_PLUS_EXPR
1039 		   || code == MULT_EXPR)
1040 		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 		swap_ssa_operands (stmt,
1042 				   gimple_assign_rhs1_ptr (stmt),
1043 				   gimple_assign_rhs2_ptr (stmt));
1044 	    }
1045 
1046 	  /* Free stmt_vec_info.  */
1047 	  free_stmt_vec_info (stmt);
1048           gsi_next (&si);
1049         }
1050     }
1051 
1052   free (LOOP_VINFO_BBS (loop_vinfo));
1053   vect_destroy_datarefs (loop_vinfo, NULL);
1054   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060     vect_free_slp_instance (instance);
1061 
1062   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1066 
1067   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1069 
1070   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1071 
1072   free (loop_vinfo);
1073   loop->aux = NULL;
1074 }
1075 
1076 
1077 /* Function vect_analyze_loop_1.
1078 
1079    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080    for it. The different analyses will record information in the
1081    loop_vec_info struct.  This is a subset of the analyses applied in
1082    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083    that is now considered for (outer-loop) vectorization.  */
1084 
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1087 {
1088   loop_vec_info loop_vinfo;
1089 
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location,
1092 		     "===== analyze_loop_nest_1 =====\n");
1093 
1094   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
1095 
1096   loop_vinfo = vect_analyze_loop_form (loop);
1097   if (!loop_vinfo)
1098     {
1099       if (dump_enabled_p ())
1100         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 			 "bad inner-loop form.\n");
1102       return NULL;
1103     }
1104 
1105   return loop_vinfo;
1106 }
1107 
1108 
1109 /* Function vect_analyze_loop_form.
1110 
1111    Verify that certain CFG restrictions hold, including:
1112    - the loop has a pre-header
1113    - the loop has a single entry and exit
1114    - the loop exit condition is simple enough, and the number of iterations
1115      can be analyzed (a countable loop).  */
1116 
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1119 {
1120   loop_vec_info loop_vinfo;
1121   gcond *loop_cond;
1122   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123   loop_vec_info inner_loop_vinfo = NULL;
1124 
1125   if (dump_enabled_p ())
1126     dump_printf_loc (MSG_NOTE, vect_location,
1127 		     "=== vect_analyze_loop_form ===\n");
1128 
1129   /* Different restrictions apply when we are considering an inner-most loop,
1130      vs. an outer (nested) loop.
1131      (FORNOW. May want to relax some of these restrictions in the future).  */
1132 
1133   if (!loop->inner)
1134     {
1135       /* Inner-most loop.  We currently require that the number of BBs is
1136 	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1137 	 look like this:
1138 
1139                         (pre-header)
1140                            |
1141                           header <--------+
1142                            | |            |
1143                            | +--> latch --+
1144                            |
1145                         (exit-bb)  */
1146 
1147       if (loop->num_nodes != 2)
1148         {
1149           if (dump_enabled_p ())
1150             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 			     "not vectorized: control flow in loop.\n");
1152           return NULL;
1153         }
1154 
1155       if (empty_block_p (loop->header))
1156 	{
1157 	  if (dump_enabled_p ())
1158 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 			     "not vectorized: empty loop.\n");
1160 	  return NULL;
1161 	}
1162     }
1163   else
1164     {
1165       struct loop *innerloop = loop->inner;
1166       edge entryedge;
1167 
1168       /* Nested loop. We currently require that the loop is doubly-nested,
1169 	 contains a single inner loop, and the number of BBs is exactly 5.
1170 	 Vectorizable outer-loops look like this:
1171 
1172 			(pre-header)
1173 			   |
1174 			  header <---+
1175 			   |         |
1176 		          inner-loop |
1177 			   |         |
1178 			  tail ------+
1179 			   |
1180 		        (exit-bb)
1181 
1182 	 The inner-loop has the properties expected of inner-most loops
1183 	 as described above.  */
1184 
1185       if ((loop->inner)->inner || (loop->inner)->next)
1186 	{
1187 	  if (dump_enabled_p ())
1188 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 			     "not vectorized: multiple nested loops.\n");
1190 	  return NULL;
1191 	}
1192 
1193       /* Analyze the inner-loop.  */
1194       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195       if (!inner_loop_vinfo)
1196 	{
1197 	  if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 			     "not vectorized: Bad inner loop.\n");
1200 	  return NULL;
1201 	}
1202 
1203       if (!expr_invariant_in_loop_p (loop,
1204 					LOOP_VINFO_NITERS (inner_loop_vinfo)))
1205 	{
1206 	  if (dump_enabled_p ())
1207 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 			     "not vectorized: inner-loop count not"
1209                              " invariant.\n");
1210 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1211 	  return NULL;
1212 	}
1213 
1214       if (loop->num_nodes != 5)
1215         {
1216 	  if (dump_enabled_p ())
1217 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 			     "not vectorized: control flow in loop.\n");
1219 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1220 	  return NULL;
1221         }
1222 
1223       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224       entryedge = EDGE_PRED (innerloop->header, 0);
1225       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 	entryedge = EDGE_PRED (innerloop->header, 1);
1227 
1228       if (entryedge->src != loop->header
1229 	  || !single_exit (innerloop)
1230 	  || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1231 	{
1232 	  if (dump_enabled_p ())
1233 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 			     "not vectorized: unsupported outerloop form.\n");
1235 	  destroy_loop_vec_info (inner_loop_vinfo, true);
1236 	  return NULL;
1237 	}
1238 
1239       if (dump_enabled_p ())
1240         dump_printf_loc (MSG_NOTE, vect_location,
1241 			 "Considering outer-loop vectorization.\n");
1242     }
1243 
1244   if (!single_exit (loop)
1245       || EDGE_COUNT (loop->header->preds) != 2)
1246     {
1247       if (dump_enabled_p ())
1248         {
1249           if (!single_exit (loop))
1250 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 			     "not vectorized: multiple exits.\n");
1252           else if (EDGE_COUNT (loop->header->preds) != 2)
1253 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 			     "not vectorized: too many incoming edges.\n");
1255         }
1256       if (inner_loop_vinfo)
1257 	destroy_loop_vec_info (inner_loop_vinfo, true);
1258       return NULL;
1259     }
1260 
1261   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262      that the loop is represented as a do-while (with a proper if-guard
1263      before the loop if needed), where the loop header contains all the
1264      executable statements, and the latch is empty.  */
1265   if (!empty_block_p (loop->latch)
1266       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267     {
1268       if (dump_enabled_p ())
1269 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 			 "not vectorized: latch block not empty.\n");
1271       if (inner_loop_vinfo)
1272 	destroy_loop_vec_info (inner_loop_vinfo, true);
1273       return NULL;
1274     }
1275 
1276   /* Make sure there exists a single-predecessor exit bb:  */
1277   if (!single_pred_p (single_exit (loop)->dest))
1278     {
1279       edge e = single_exit (loop);
1280       if (!(e->flags & EDGE_ABNORMAL))
1281 	{
1282 	  split_loop_exit_edge (e);
1283 	  if (dump_enabled_p ())
1284 	    dump_printf (MSG_NOTE, "split exit edge.\n");
1285 	}
1286       else
1287 	{
1288 	  if (dump_enabled_p ())
1289 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 			     "not vectorized: abnormal loop exit edge.\n");
1291 	  if (inner_loop_vinfo)
1292 	    destroy_loop_vec_info (inner_loop_vinfo, true);
1293 	  return NULL;
1294 	}
1295     }
1296 
1297   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 				    &number_of_iterationsm1);
1299   if (!loop_cond)
1300     {
1301       if (dump_enabled_p ())
1302 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 			 "not vectorized: complicated exit condition.\n");
1304       if (inner_loop_vinfo)
1305 	destroy_loop_vec_info (inner_loop_vinfo, true);
1306       return NULL;
1307     }
1308 
1309   if (!number_of_iterations
1310       || chrec_contains_undetermined (number_of_iterations))
1311     {
1312       if (dump_enabled_p ())
1313 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 			 "not vectorized: number of iterations cannot be "
1315 			 "computed.\n");
1316       if (inner_loop_vinfo)
1317 	destroy_loop_vec_info (inner_loop_vinfo, true);
1318       return NULL;
1319     }
1320 
1321   if (integer_zerop (number_of_iterations))
1322     {
1323       if (dump_enabled_p ())
1324 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 			 "not vectorized: number of iterations = 0.\n");
1326       if (inner_loop_vinfo)
1327         destroy_loop_vec_info (inner_loop_vinfo, true);
1328       return NULL;
1329     }
1330 
1331   loop_vinfo = new_loop_vec_info (loop);
1332   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1335 
1336   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1337     {
1338       if (dump_enabled_p ())
1339         {
1340           dump_printf_loc (MSG_NOTE, vect_location,
1341 			   "Symbolic number of iterations is ");
1342 	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343           dump_printf (MSG_NOTE, "\n");
1344         }
1345     }
1346 
1347   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1348 
1349   /* CHECKME: May want to keep it around in the future.  */
1350   if (inner_loop_vinfo)
1351     destroy_loop_vec_info (inner_loop_vinfo, false);
1352 
1353   gcc_assert (!loop->aux);
1354   loop->aux = loop_vinfo;
1355   return loop_vinfo;
1356 }
1357 
1358 
1359 /* Function vect_analyze_loop_operations.
1360 
1361    Scan the loop stmts and make sure they are all vectorizable.  */
1362 
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1365 {
1366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368   int nbbs = loop->num_nodes;
1369   unsigned int vectorization_factor = 0;
1370   int i;
1371   stmt_vec_info stmt_info;
1372   bool need_to_vectorize = false;
1373   int min_profitable_iters;
1374   int min_scalar_loop_bound;
1375   unsigned int th;
1376   bool only_slp_in_loop = true, ok;
1377   HOST_WIDE_INT max_niter;
1378   HOST_WIDE_INT estimated_niter;
1379   int min_profitable_estimate;
1380 
1381   if (dump_enabled_p ())
1382     dump_printf_loc (MSG_NOTE, vect_location,
1383 		     "=== vect_analyze_loop_operations ===\n");
1384 
1385   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387   if (slp)
1388     {
1389       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 	 the vectorization factor of the loop is the unrolling factor required by
1391 	 the SLP instances.  If that unrolling factor is 1, we say that we
1392 	 perform pure SLP on the loop - cross-iteration parallelism is not
1393 	 exploited.  */
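
      /* For illustration only (hypothetical numbers): in the mixed SLP/loop
	 case below, with a loop vectorization factor of 4 and an SLP
	 unrolling factor of 2, least_common_multiple (4, 2) keeps the factor
	 at 4, whereas an SLP unrolling factor of 3 would raise it to 12.  */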
1394       for (i = 0; i < nbbs; i++)
1395 	{
1396 	  basic_block bb = bbs[i];
1397 	  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 	       gsi_next (&si))
1399 	    {
1400 	      gimple stmt = gsi_stmt (si);
1401 	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402 	      gcc_assert (stmt_info);
1403 	      if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404 		   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405 		  && !PURE_SLP_STMT (stmt_info))
1406 		/* STMT needs both SLP and loop-based vectorization.  */
1407 		only_slp_in_loop = false;
1408 	    }
1409 	}
1410 
1411       if (only_slp_in_loop)
1412 	vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413       else
1414 	vectorization_factor = least_common_multiple (vectorization_factor,
1415 				LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1416 
1417       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1418       if (dump_enabled_p ())
1419 	dump_printf_loc (MSG_NOTE, vect_location,
1420 			 "Updating vectorization factor to %d\n",
1421 			 vectorization_factor);
1422     }
1423 
1424   for (i = 0; i < nbbs; i++)
1425     {
1426       basic_block bb = bbs[i];
1427 
1428       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1429 	   gsi_next (&si))
1430         {
1431           gphi *phi = si.phi ();
1432           ok = true;
1433 
1434           stmt_info = vinfo_for_stmt (phi);
1435           if (dump_enabled_p ())
1436             {
1437               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1438               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1439               dump_printf (MSG_NOTE, "\n");
1440             }
1441 
1442           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1443              (i.e., a phi in the tail of the outer-loop).  */
1444           if (! is_loop_header_bb_p (bb))
1445             {
1446               /* FORNOW: we currently don't support the case that these phis
1447                  are not used in the outer-loop (unless it is a double reduction,
1448                  i.e., this phi has def-type vect_double_reduction_def), because
1449                  this case requires us to actually do something here.  */
1450               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1451                    || STMT_VINFO_LIVE_P (stmt_info))
1452                   && STMT_VINFO_DEF_TYPE (stmt_info)
1453                      != vect_double_reduction_def)
1454                 {
1455                   if (dump_enabled_p ())
1456 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457 				     "Unsupported loop-closed phi in "
1458 				     "outer-loop.\n");
1459                   return false;
1460                 }
1461 
1462               /* If PHI is used in the outer loop, we check that its operand
1463                  is defined in the inner loop.  */
1464               if (STMT_VINFO_RELEVANT_P (stmt_info))
1465                 {
1466                   tree phi_op;
1467                   gimple op_def_stmt;
1468 
1469                   if (gimple_phi_num_args (phi) != 1)
1470                     return false;
1471 
1472                   phi_op = PHI_ARG_DEF (phi, 0);
1473                   if (TREE_CODE (phi_op) != SSA_NAME)
1474                     return false;
1475 
1476                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1477 		  if (gimple_nop_p (op_def_stmt)
1478 		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1479 		      || !vinfo_for_stmt (op_def_stmt))
1480                     return false;
1481 
1482                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1483                         != vect_used_in_outer
1484                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1485                            != vect_used_in_outer_by_reduction)
1486                     return false;
1487                 }
1488 
1489               continue;
1490             }
1491 
1492           gcc_assert (stmt_info);
1493 
1494           if (STMT_VINFO_LIVE_P (stmt_info))
1495             {
1496               /* FORNOW: not yet supported.  */
1497               if (dump_enabled_p ())
1498 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 				 "not vectorized: value used after loop.\n");
1500               return false;
1501             }
1502 
1503           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1504               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1505             {
1506               /* A scalar-dependence cycle that we don't support.  */
1507               if (dump_enabled_p ())
1508 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509 				 "not vectorized: scalar dependence cycle.\n");
1510               return false;
1511             }
1512 
1513           if (STMT_VINFO_RELEVANT_P (stmt_info))
1514             {
1515               need_to_vectorize = true;
1516               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1517                 ok = vectorizable_induction (phi, NULL, NULL);
1518             }
1519 
1520           if (!ok)
1521             {
1522               if (dump_enabled_p ())
1523                 {
1524 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 				   "not vectorized: relevant phi not "
1526 				   "supported: ");
1527                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1528                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1529                 }
1530 	      return false;
1531             }
1532         }
1533 
1534       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1535 	   gsi_next (&si))
1536         {
1537           gimple stmt = gsi_stmt (si);
1538 	  if (!gimple_clobber_p (stmt)
1539 	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1540 	    return false;
1541         }
1542     } /* bbs */
1543 
1544   /* All operations in the loop are either irrelevant (they deal with loop
1545      control, or are dead), or are only used outside the loop and can be
1546      moved out of it (e.g. invariants, inductions).  The loop can be
1547      optimized away by scalar optimizations.  We're better off not
1548      touching this loop.  */
1549   if (!need_to_vectorize)
1550     {
1551       if (dump_enabled_p ())
1552         dump_printf_loc (MSG_NOTE, vect_location,
1553 			 "All the computation can be taken out of the loop.\n");
1554       if (dump_enabled_p ())
1555 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 			 "not vectorized: redundant loop. no profit to "
1557 			 "vectorize.\n");
1558       return false;
1559     }
1560 
1561   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1562     dump_printf_loc (MSG_NOTE, vect_location,
1563 		     "vectorization_factor = %d, niters = "
1564 		     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1565 		     LOOP_VINFO_INT_NITERS (loop_vinfo));
1566 
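       /* An illustrative note (not in the original sources): with a
	  vectorization factor of 8, a loop that is known, or can be proven
	  via max_stmt_executions_int, to run at most 5 iterations can never
	  fill a single vector, so it is rejected by the check below.  */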
1567   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1568        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1569       || ((max_niter = max_stmt_executions_int (loop)) != -1
1570 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1571     {
1572       if (dump_enabled_p ())
1573 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 			 "not vectorized: iteration count too small.\n");
1575       if (dump_enabled_p ())
1576 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 			 "not vectorized: iteration count smaller than "
1578 			 "vectorization factor.\n");
1579       return false;
1580     }
1581 
1582   /* Analyze cost.  Decide if worth while to vectorize.  */
1583 
1584   /* Once VF is set, SLP costs should be updated since the number of created
1585      vector stmts depends on VF.  */
1586   vect_update_slp_costs_according_to_vf (loop_vinfo);
1587 
1588   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1589 				      &min_profitable_estimate);
1590   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1591 
1592   if (min_profitable_iters < 0)
1593     {
1594       if (dump_enabled_p ())
1595 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 			 "not vectorized: vectorization not profitable.\n");
1597       if (dump_enabled_p ())
1598 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 			 "not vectorized: vector version will never be "
1600 			 "profitable.\n");
1601       return false;
1602     }
1603 
1604   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1605                             * vectorization_factor) - 1);
1606 
1607 
1608   /* Use the cost model only if it is more conservative than the
1609      user-specified threshold.  */
1610 
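       /* A hypothetical example of the selection below: with
	  --param min-vect-loop-bound=2 and a vectorization factor of 4,
	  MIN_SCALAR_LOOP_BOUND is 2 * 4 - 1 = 7; if the cost model reports
	  MIN_PROFITABLE_ITERS of 10, the more conservative value 10 is used
	  as the threshold TH.  */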
1611   th = (unsigned) min_scalar_loop_bound;
1612   if (min_profitable_iters
1613       && (!min_scalar_loop_bound
1614           || min_profitable_iters > min_scalar_loop_bound))
1615     th = (unsigned) min_profitable_iters;
1616 
1617   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1618 
1619   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1620       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1621     {
1622       if (dump_enabled_p ())
1623 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 			 "not vectorized: vectorization not profitable.\n");
1625       if (dump_enabled_p ())
1626         dump_printf_loc (MSG_NOTE, vect_location,
1627 			 "not vectorized: iteration count smaller than user "
1628 			 "specified loop bound parameter or minimum profitable "
1629 			 "iterations (whichever is more conservative).\n");
1630       return false;
1631     }
1632 
1633   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1634       && ((unsigned HOST_WIDE_INT) estimated_niter
1635           <= MAX (th, (unsigned)min_profitable_estimate)))
1636     {
1637       if (dump_enabled_p ())
1638 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 			 "not vectorized: estimated iteration count too "
1640                          "small.\n");
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_NOTE, vect_location,
1643 			 "not vectorized: estimated iteration count smaller "
1644                          "than specified loop bound parameter or minimum "
1645                          "profitable iterations (whichever is more "
1646                          "conservative).\n");
1647       return false;
1648     }
1649 
1650   return true;
1651 }
1652 
1653 
1654 /* Function vect_analyze_loop_2.
1655 
1656    Apply a set of analyses on the loop described by LOOP_VINFO.
1657    The different analyses will record their results in the
1658    loop_vec_info struct.  */
1659 static bool
1660 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1661 {
1662   bool ok, slp = false;
1663   int max_vf = MAX_VECTORIZATION_FACTOR;
1664   int min_vf = 2;
1665   unsigned int th;
1666   unsigned int n_stmts = 0;
1667 
1668   /* Find all data references in the loop (which correspond to vdefs/vuses)
1669      and analyze their evolution in the loop.  Also adjust the minimal
1670      vectorization factor according to the loads and stores.
1671 
1672      FORNOW: Handle only simple array references whose
1673      alignment can be forced, and aligned pointer-references.  */
1674 
1675   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1676   if (!ok)
1677     {
1678       if (dump_enabled_p ())
1679 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 			 "bad data references.\n");
1681       return false;
1682     }
1683 
1684   /* Classify all cross-iteration scalar data-flow cycles.
1685      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1686 
1687   vect_analyze_scalar_cycles (loop_vinfo);
1688 
1689   vect_pattern_recog (loop_vinfo, NULL);
1690 
1691   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1692      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1693 
1694   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1695   if (!ok)
1696     {
1697       if (dump_enabled_p ())
1698 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 			 "bad data access.\n");
1700       return false;
1701     }
1702 
1703   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1704 
1705   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1706   if (!ok)
1707     {
1708       if (dump_enabled_p ())
1709 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 			 "unexpected pattern.\n");
1711       return false;
1712     }
1713 
1714   /* Analyze data dependences between the data-refs in the loop
1715      and adjust the maximum vectorization factor according to
1716      the dependences.
1717      FORNOW: fail at the first data dependence that we encounter.  */
1718 
1719   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1720   if (!ok
1721       || max_vf < min_vf)
1722     {
1723       if (dump_enabled_p ())
1724 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 			     "bad data dependence.\n");
1726       return false;
1727     }
1728 
1729   ok = vect_determine_vectorization_factor (loop_vinfo);
1730   if (!ok)
1731     {
1732       if (dump_enabled_p ())
1733 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734 			 "can't determine vectorization factor.\n");
1735       return false;
1736     }
1737   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1738     {
1739       if (dump_enabled_p ())
1740 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 			 "bad data dependence.\n");
1742       return false;
1743     }
1744 
1745   /* Analyze the alignment of the data-refs in the loop.
1746      Fail if a data reference is found that cannot be vectorized.  */
1747 
1748   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1749   if (!ok)
1750     {
1751       if (dump_enabled_p ())
1752 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 			 "bad data alignment.\n");
1754       return false;
1755     }
1756 
1757   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1758      It is important to call pruning after vect_analyze_data_ref_accesses,
1759      since we use grouping information gathered by interleaving analysis.  */
1760   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1761   if (!ok)
1762     {
1763       if (dump_enabled_p ())
1764 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765 			 "number of versioning for alias "
1766 			 "run-time tests exceeds %d "
1767 			 "(--param vect-max-version-for-alias-checks)\n",
1768 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1769       return false;
1770     }
1771 
1772   /* This pass will decide on using loop versioning and/or loop peeling in
1773      order to enhance the alignment of data references in the loop.  */
1774 
1775   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1776   if (!ok)
1777     {
1778       if (dump_enabled_p ())
1779 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780 			 "bad data alignment.\n");
1781       return false;
1782     }
1783 
1784   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1785   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1786   if (ok)
1787     {
1788       /* Decide which possible SLP instances to SLP.  */
1789       slp = vect_make_slp_decision (loop_vinfo);
1790 
1791       /* Find stmts that need to be both vectorized and SLPed.  */
1792       vect_detect_hybrid_slp (loop_vinfo);
1793     }
1794   else
1795     return false;
1796 
1797   /* Scan all the operations in the loop and make sure they are
1798      vectorizable.  */
1799 
1800   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1801   if (!ok)
1802     {
1803       if (dump_enabled_p ())
1804 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805 			 "bad operation or unsupported loop bound.\n");
1806       return false;
1807     }
1808 
1809   /* Decide whether we need to create an epilogue loop to handle
1810      remaining scalar iterations.  */
1811   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1812         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1813        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1814 
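       /* An illustrative sketch of the decision below (not from the original
	  sources): with a vectorization factor of 4 and a known iteration
	  count of 10, peeling 2 iterations for alignment leaves 8, whose
	  trailing zero bits (3) are not fewer than log2 (4) = 2, so no
	  epilogue is needed; had the remainder been 9 instead, an epilogue
	  loop would be required.  */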
1815   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1816       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1817     {
1818       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1819 		   - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1820 	  < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1821 	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1822     }
1823   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1824 	   || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1825 	       < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1826                /* In case of versioning, check if the maximum number of
1827                   iterations is greater than th.  If they are identical,
1828                   the epilogue is unnecessary.  */
1829 	       && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1830 	            && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1831                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1832 		        (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1833     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1834 
1835   /* If an epilogue loop is required make sure we can create one.  */
1836   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1837       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1838     {
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1841       if (!vect_can_advance_ivs_p (loop_vinfo)
1842 	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1843 					   single_exit (LOOP_VINFO_LOOP
1844 							 (loop_vinfo))))
1845         {
1846           if (dump_enabled_p ())
1847 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 			     "not vectorized: can't create required "
1849 			     "epilog loop\n");
1850           return false;
1851         }
1852     }
1853 
1854   return true;
1855 }
1856 
1857 /* Function vect_analyze_loop.
1858 
1859    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1860    for it.  The different analyses will record information in the
1861    loop_vec_info struct.  */
1862 loop_vec_info
1863 vect_analyze_loop (struct loop *loop)
1864 {
1865   loop_vec_info loop_vinfo;
1866   unsigned int vector_sizes;
1867 
1868   /* Autodetect first vector size we try.  */
1869   current_vector_size = 0;
1870   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1871 
1872   if (dump_enabled_p ())
1873     dump_printf_loc (MSG_NOTE, vect_location,
1874 		     "===== analyze_loop_nest =====\n");
1875 
1876   if (loop_outer (loop)
1877       && loop_vec_info_for_loop (loop_outer (loop))
1878       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1879     {
1880       if (dump_enabled_p ())
1881 	dump_printf_loc (MSG_NOTE, vect_location,
1882 			 "outer-loop already vectorized.\n");
1883       return NULL;
1884     }
1885 
1886   while (1)
1887     {
1888       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1889       loop_vinfo = vect_analyze_loop_form (loop);
1890       if (!loop_vinfo)
1891 	{
1892 	  if (dump_enabled_p ())
1893 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 			     "bad loop form.\n");
1895 	  return NULL;
1896 	}
1897 
1898       if (vect_analyze_loop_2 (loop_vinfo))
1899 	{
1900 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1901 
1902 	  return loop_vinfo;
1903 	}
1904 
1905       destroy_loop_vec_info (loop_vinfo, true);
1906 
1907       vector_sizes &= ~current_vector_size;
1908       if (vector_sizes == 0
1909 	  || current_vector_size == 0)
1910 	return NULL;
1911 
1912       /* Try the next biggest vector size.  */
1913       current_vector_size = 1 << floor_log2 (vector_sizes);
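       /* A hypothetical illustration (not target-specific): if the target
	  advertises 32- and 16-byte vectors, VECTOR_SIZES starts as 0x30;
	  after a failed analysis with 32-byte vectors that bit is cleared
	  above and the next attempt uses CURRENT_VECTOR_SIZE == 16.  */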
1914       if (dump_enabled_p ())
1915 	dump_printf_loc (MSG_NOTE, vect_location,
1916 			 "***** Re-trying analysis with "
1917 			 "vector size %d\n", current_vector_size);
1918     }
1919 }
1920 
1921 
1922 /* Function reduction_code_for_scalar_code
1923 
1924    Input:
1925    CODE - tree_code of a reduction operation.
1926 
1927    Output:
1928    REDUC_CODE - the corresponding tree-code to be used to reduce the
1929       vector of partial results into a single scalar result, or ERROR_MARK
1930       if the operation is a supported reduction operation, but does not have
1931       such a tree-code.
1932 
1933    Return FALSE if CODE currently cannot be vectorized as reduction.  */
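     /* An illustrative note (not part of the original comment): a MAX
	reduction such as "m = m < a[i] ? a[i] : m" maps to REDUC_MAX_EXPR,
	which collapses the vector of partial maxima into one scalar, whereas
	a supported operation that returns ERROR_MARK here has its partial
	results combined by other means in the reduction epilogue.  */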
1934 
1935 static bool
1936 reduction_code_for_scalar_code (enum tree_code code,
1937                                 enum tree_code *reduc_code)
1938 {
1939   switch (code)
1940     {
1941       case MAX_EXPR:
1942         *reduc_code = REDUC_MAX_EXPR;
1943         return true;
1944 
1945       case MIN_EXPR:
1946         *reduc_code = REDUC_MIN_EXPR;
1947         return true;
1948 
1949       case PLUS_EXPR:
1950         *reduc_code = REDUC_PLUS_EXPR;
1951         return true;
1952 
1953       case MULT_EXPR:
1954       case MINUS_EXPR:
1955       case BIT_IOR_EXPR:
1956       case BIT_XOR_EXPR:
1957       case BIT_AND_EXPR:
1958         *reduc_code = ERROR_MARK;
1959         return true;
1960 
1961       default:
1962        return false;
1963     }
1964 }
1965 
1966 
1967 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1968    STMT is printed with a message MSG. */
1969 
1970 static void
1971 report_vect_op (int msg_type, gimple stmt, const char *msg)
1972 {
1973   dump_printf_loc (msg_type, vect_location, "%s", msg);
1974   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1975   dump_printf (msg_type, "\n");
1976 }
1977 
1978 
1979 /* Detect SLP reduction of the form:
1980 
1981    #a1 = phi <a5, a0>
1982    a2 = operation (a1)
1983    a3 = operation (a2)
1984    a4 = operation (a3)
1985    a5 = operation (a4)
1986 
1987    #a = phi <a5>
1988 
1989    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1990    FIRST_STMT is the first reduction stmt in the chain
1991    (a2 = operation (a1)).
1992 
1993    Return TRUE if a reduction chain was detected.  */
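     /* For illustration only (not from the original sources), such a chain
	typically arises from a manually unrolled reduction:

	    for (i = 0; i < n; i++)
	      {
		s = s + a[4*i];
		s = s + a[4*i+1];
		s = s + a[4*i+2];
		s = s + a[4*i+3];
	      }

	where each statement feeds the next and only the last value reaches
	the loop-exit phi.  */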
1994 
1995 static bool
1996 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1997 {
1998   struct loop *loop = (gimple_bb (phi))->loop_father;
1999   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2000   enum tree_code code;
2001   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2002   stmt_vec_info use_stmt_info, current_stmt_info;
2003   tree lhs;
2004   imm_use_iterator imm_iter;
2005   use_operand_p use_p;
2006   int nloop_uses, size = 0, n_out_of_loop_uses;
2007   bool found = false;
2008 
2009   if (loop != vect_loop)
2010     return false;
2011 
2012   lhs = PHI_RESULT (phi);
2013   code = gimple_assign_rhs_code (first_stmt);
2014   while (1)
2015     {
2016       nloop_uses = 0;
2017       n_out_of_loop_uses = 0;
2018       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2019         {
2020 	  gimple use_stmt = USE_STMT (use_p);
2021 	  if (is_gimple_debug (use_stmt))
2022 	    continue;
2023 
2024           /* Check if we got back to the reduction phi.  */
2025 	  if (use_stmt == phi)
2026             {
2027 	      loop_use_stmt = use_stmt;
2028               found = true;
2029               break;
2030             }
2031 
2032           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2033             {
2034               if (vinfo_for_stmt (use_stmt)
2035                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2036                 {
2037                   loop_use_stmt = use_stmt;
2038                   nloop_uses++;
2039                 }
2040             }
2041            else
2042              n_out_of_loop_uses++;
2043 
2044            /* There can be either a single use in the loop or two uses in
2045               phi nodes.  */
2046            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2047              return false;
2048         }
2049 
2050       if (found)
2051         break;
2052 
2053       /* We reached a statement with no loop uses.  */
2054       if (nloop_uses == 0)
2055 	return false;
2056 
2057       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2058       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2059         return false;
2060 
2061       if (!is_gimple_assign (loop_use_stmt)
2062 	  || code != gimple_assign_rhs_code (loop_use_stmt)
2063 	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2064         return false;
2065 
2066       /* Insert USE_STMT into reduction chain.  */
2067       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2068       if (current_stmt)
2069         {
2070           current_stmt_info = vinfo_for_stmt (current_stmt);
2071 	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2072           GROUP_FIRST_ELEMENT (use_stmt_info)
2073             = GROUP_FIRST_ELEMENT (current_stmt_info);
2074         }
2075       else
2076 	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2077 
2078       lhs = gimple_assign_lhs (loop_use_stmt);
2079       current_stmt = loop_use_stmt;
2080       size++;
2081    }
2082 
2083   if (!found || loop_use_stmt != phi || size < 2)
2084     return false;
2085 
2086   /* Swap the operands, if needed, to make the reduction operand the second
2087      operand.  */
2088   lhs = PHI_RESULT (phi);
2089   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2090   while (next_stmt)
2091     {
2092       if (gimple_assign_rhs2 (next_stmt) == lhs)
2093 	{
2094 	  tree op = gimple_assign_rhs1 (next_stmt);
2095           gimple def_stmt = NULL;
2096 
2097           if (TREE_CODE (op) == SSA_NAME)
2098             def_stmt = SSA_NAME_DEF_STMT (op);
2099 
2100 	  /* Check that the other def is either defined in the loop
2101 	     ("vect_internal_def"), or it's an induction (defined by a
2102 	     loop-header phi-node).  */
2103           if (def_stmt
2104               && gimple_bb (def_stmt)
2105 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2106               && (is_gimple_assign (def_stmt)
2107                   || is_gimple_call (def_stmt)
2108                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109                            == vect_induction_def
2110                   || (gimple_code (def_stmt) == GIMPLE_PHI
2111                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2112                                   == vect_internal_def
2113                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2114 	    {
2115 	      lhs = gimple_assign_lhs (next_stmt);
2116 	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2117  	      continue;
2118 	    }
2119 
2120 	  return false;
2121 	}
2122       else
2123 	{
2124           tree op = gimple_assign_rhs2 (next_stmt);
2125           gimple def_stmt = NULL;
2126 
2127           if (TREE_CODE (op) == SSA_NAME)
2128             def_stmt = SSA_NAME_DEF_STMT (op);
2129 
2130           /* Check that the other def is either defined in the loop
2131             ("vect_internal_def"), or it's an induction (defined by a
2132             loop-header phi-node).  */
2133           if (def_stmt
2134               && gimple_bb (def_stmt)
2135 	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2136               && (is_gimple_assign (def_stmt)
2137                   || is_gimple_call (def_stmt)
2138                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2139                               == vect_induction_def
2140                   || (gimple_code (def_stmt) == GIMPLE_PHI
2141                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142                                   == vect_internal_def
2143                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2144   	    {
2145 	      if (dump_enabled_p ())
2146 		{
2147 		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2148 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2149                   dump_printf (MSG_NOTE, "\n");
2150 		}
2151 
2152 	      swap_ssa_operands (next_stmt,
2153 	 		         gimple_assign_rhs1_ptr (next_stmt),
2154                                  gimple_assign_rhs2_ptr (next_stmt));
2155 	      update_stmt (next_stmt);
2156 
2157 	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2158 		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2159 	    }
2160 	  else
2161 	    return false;
2162         }
2163 
2164       lhs = gimple_assign_lhs (next_stmt);
2165       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2166     }
2167 
2168   /* Save the chain for further analysis in SLP detection.  */
2169   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2170   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2171   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2172 
2173   return true;
2174 }
2175 
2176 
2177 /* Function vect_is_simple_reduction_1
2178 
2179    (1) Detect a cross-iteration def-use cycle that represents a simple
2180    reduction computation.  We look for the following pattern:
2181 
2182    loop_header:
2183      a1 = phi < a0, a2 >
2184      a3 = ...
2185      a2 = operation (a3, a1)
2186 
2187    or
2188 
2189    a3 = ...
2190    loop_header:
2191      a1 = phi < a0, a2 >
2192      a2 = operation (a3, a1)
2193 
2194    such that:
2195    1. operation is commutative and associative and it is safe to
2196       change the order of the computation (if CHECK_REDUCTION is true)
2197    2. no uses for a2 in the loop (a2 is used out of the loop)
2198    3. no uses of a1 in the loop besides the reduction operation
2199    4. no uses of a1 outside the loop.
2200 
2201    Conditions 1,4 are tested here.
2202    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2203 
2204    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2205    nested cycles, if CHECK_REDUCTION is false.
2206 
2207    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2208    reductions:
2209 
2210      a1 = phi < a0, a2 >
2211      inner loop (def of a3)
2212      a2 = phi < a3 >
2213 
2214    If MODIFY is true it also tries to rework the code in-place to enable
2215    detection of more reduction patterns.  For the time being we rewrite
2216    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2217 */
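     /* An illustrative example (not part of the original comment): pattern (1)
	is what a plain accumulation loop such as

	    for (i = 0; i < n; i++)
	      sum += a[i];

	looks like in SSA form, with the loop-header phi for SUM carrying the
	cross-iteration cycle.  */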
2218 
2219 static gimple
2220 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2221 			    bool check_reduction, bool *double_reduc,
2222 			    bool modify)
2223 {
2224   struct loop *loop = (gimple_bb (phi))->loop_father;
2225   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2226   edge latch_e = loop_latch_edge (loop);
2227   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2228   gimple def_stmt, def1 = NULL, def2 = NULL;
2229   enum tree_code orig_code, code;
2230   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2231   tree type;
2232   int nloop_uses;
2233   tree name;
2234   imm_use_iterator imm_iter;
2235   use_operand_p use_p;
2236   bool phi_def;
2237 
2238   *double_reduc = false;
2239 
2240   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2241      otherwise, we assume outer loop vectorization.  */
2242   gcc_assert ((check_reduction && loop == vect_loop)
2243               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2244 
2245   name = PHI_RESULT (phi);
2246   /* ???  If there are no uses of the PHI result the inner loop reduction
2247      won't be detected as possibly double-reduction by vectorizable_reduction
2248      because that tries to walk the PHI arg from the preheader edge which
2249      can be constant.  See PR60382.  */
2250   if (has_zero_uses (name))
2251     return NULL;
2252   nloop_uses = 0;
2253   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2254     {
2255       gimple use_stmt = USE_STMT (use_p);
2256       if (is_gimple_debug (use_stmt))
2257 	continue;
2258 
2259       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2260         {
2261           if (dump_enabled_p ())
2262 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 			     "intermediate value used outside loop.\n");
2264 
2265           return NULL;
2266         }
2267 
2268       if (vinfo_for_stmt (use_stmt)
2269 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2270         nloop_uses++;
2271       if (nloop_uses > 1)
2272         {
2273           if (dump_enabled_p ())
2274 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 			     "reduction used in loop.\n");
2276           return NULL;
2277         }
2278     }
2279 
2280   if (TREE_CODE (loop_arg) != SSA_NAME)
2281     {
2282       if (dump_enabled_p ())
2283 	{
2284 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 			   "reduction: not ssa_name: ");
2286 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2287           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288 	}
2289       return NULL;
2290     }
2291 
2292   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2293   if (!def_stmt)
2294     {
2295       if (dump_enabled_p ())
2296 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 			 "reduction: no def_stmt.\n");
2298       return NULL;
2299     }
2300 
2301   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302     {
2303       if (dump_enabled_p ())
2304         {
2305           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2306           dump_printf (MSG_NOTE, "\n");
2307         }
2308       return NULL;
2309     }
2310 
2311   if (is_gimple_assign (def_stmt))
2312     {
2313       name = gimple_assign_lhs (def_stmt);
2314       phi_def = false;
2315     }
2316   else
2317     {
2318       name = PHI_RESULT (def_stmt);
2319       phi_def = true;
2320     }
2321 
2322   nloop_uses = 0;
2323   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324     {
2325       gimple use_stmt = USE_STMT (use_p);
2326       if (is_gimple_debug (use_stmt))
2327 	continue;
2328       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2329 	  && vinfo_for_stmt (use_stmt)
2330 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2331 	nloop_uses++;
2332       if (nloop_uses > 1)
2333 	{
2334 	  if (dump_enabled_p ())
2335 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 			     "reduction used in loop.\n");
2337 	  return NULL;
2338 	}
2339     }
2340 
2341   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2342      defined in the inner loop.  */
2343   if (phi_def)
2344     {
2345       op1 = PHI_ARG_DEF (def_stmt, 0);
2346 
2347       if (gimple_phi_num_args (def_stmt) != 1
2348           || TREE_CODE (op1) != SSA_NAME)
2349         {
2350           if (dump_enabled_p ())
2351 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 			     "unsupported phi node definition.\n");
2353 
2354           return NULL;
2355         }
2356 
2357       def1 = SSA_NAME_DEF_STMT (op1);
2358       if (gimple_bb (def1)
2359 	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2360           && loop->inner
2361           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2362           && is_gimple_assign (def1))
2363         {
2364           if (dump_enabled_p ())
2365             report_vect_op (MSG_NOTE, def_stmt,
2366 			    "detected double reduction: ");
2367 
2368           *double_reduc = true;
2369           return def_stmt;
2370         }
2371 
2372       return NULL;
2373     }
2374 
2375   code = orig_code = gimple_assign_rhs_code (def_stmt);
2376 
2377   /* We can handle "res -= x[i]", which is non-associative, by
2378      simply rewriting it into "res += -x[i]".  Avoid changing the
2379      gimple instruction for the first simple tests and only do this
2380      if we're allowed to change code at all.  */
2381   if (code == MINUS_EXPR
2382       && modify
2383       && (op1 = gimple_assign_rhs1 (def_stmt))
2384       && TREE_CODE (op1) == SSA_NAME
2385       && SSA_NAME_DEF_STMT (op1) == phi)
2386     code = PLUS_EXPR;
2387 
2388   if (check_reduction
2389       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2390     {
2391       if (dump_enabled_p ())
2392         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2393 			"reduction: not commutative/associative: ");
2394       return NULL;
2395     }
2396 
2397   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2398     {
2399       if (code != COND_EXPR)
2400         {
2401 	  if (dump_enabled_p ())
2402 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 			    "reduction: not binary operation: ");
2404 
2405           return NULL;
2406         }
2407 
2408       op3 = gimple_assign_rhs1 (def_stmt);
2409       if (COMPARISON_CLASS_P (op3))
2410         {
2411           op4 = TREE_OPERAND (op3, 1);
2412           op3 = TREE_OPERAND (op3, 0);
2413         }
2414 
2415       op1 = gimple_assign_rhs2 (def_stmt);
2416       op2 = gimple_assign_rhs3 (def_stmt);
2417 
2418       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2419         {
2420           if (dump_enabled_p ())
2421             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422 			    "reduction: uses not ssa_names: ");
2423 
2424           return NULL;
2425         }
2426     }
2427   else
2428     {
2429       op1 = gimple_assign_rhs1 (def_stmt);
2430       op2 = gimple_assign_rhs2 (def_stmt);
2431 
2432       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2433         {
2434           if (dump_enabled_p ())
2435 	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2436 			    "reduction: uses not ssa_names: ");
2437 
2438           return NULL;
2439         }
2440    }
2441 
2442   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2443   if ((TREE_CODE (op1) == SSA_NAME
2444        && !types_compatible_p (type,TREE_TYPE (op1)))
2445       || (TREE_CODE (op2) == SSA_NAME
2446           && !types_compatible_p (type, TREE_TYPE (op2)))
2447       || (op3 && TREE_CODE (op3) == SSA_NAME
2448           && !types_compatible_p (type, TREE_TYPE (op3)))
2449       || (op4 && TREE_CODE (op4) == SSA_NAME
2450           && !types_compatible_p (type, TREE_TYPE (op4))))
2451     {
2452       if (dump_enabled_p ())
2453         {
2454           dump_printf_loc (MSG_NOTE, vect_location,
2455 			   "reduction: multiple types: operation type: ");
2456           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2457           dump_printf (MSG_NOTE, ", operands types: ");
2458           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459 			     TREE_TYPE (op1));
2460           dump_printf (MSG_NOTE, ",");
2461           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2462 			     TREE_TYPE (op2));
2463           if (op3)
2464             {
2465               dump_printf (MSG_NOTE, ",");
2466               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2467 				 TREE_TYPE (op3));
2468             }
2469 
2470           if (op4)
2471             {
2472               dump_printf (MSG_NOTE, ",");
2473               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2474 				 TREE_TYPE (op4));
2475             }
2476           dump_printf (MSG_NOTE, "\n");
2477         }
2478 
2479       return NULL;
2480     }
2481 
2482   /* Check that it's ok to change the order of the computation.
2483      Generally, when vectorizing a reduction we change the order of the
2484      computation.  This may change the behavior of the program in some
2485      cases, so we need to check that this is ok.  One exception is when
2486      vectorizing an outer-loop: the inner-loop is executed sequentially,
2487      and therefore vectorizing reductions in the inner-loop during
2488      outer-loop vectorization is safe.  */
2489 
2490   /* CHECKME: check for !flag_finite_math_only too?  */
2491   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2492       && check_reduction)
2493     {
2494       /* Changing the order of operations changes the semantics.  */
2495       if (dump_enabled_p ())
2496 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2497 			"reduction: unsafe fp math optimization: ");
2498       return NULL;
2499     }
2500   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2501 	   && check_reduction)
2502     {
2503       /* Changing the order of operations changes the semantics.  */
2504       if (dump_enabled_p ())
2505 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2506 			"reduction: unsafe int math optimization: ");
2507       return NULL;
2508     }
2509   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2510     {
2511       /* Changing the order of operations changes the semantics.  */
2512       if (dump_enabled_p ())
2513 	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2514 			"reduction: unsafe fixed-point math optimization: ");
2515       return NULL;
2516     }
2517 
2518   /* If we detected "res -= x[i]" earlier, rewrite it into
2519      "res += -x[i]" now.  If this turns out to be useless reassoc
2520      will clean it up again.  */
2521   if (orig_code == MINUS_EXPR)
2522     {
2523       tree rhs = gimple_assign_rhs2 (def_stmt);
2524       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2525       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2526       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2527       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2528 							  loop_info, NULL));
2529       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2530       gimple_assign_set_rhs2 (def_stmt, negrhs);
2531       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2532       update_stmt (def_stmt);
2533     }
2534 
2535   /* Reduction is safe. We're dealing with one of the following:
2536      1) integer arithmetic and no trapv
2537      2) floating point arithmetic, and special flags permit this optimization
2538      3) nested cycle (i.e., outer loop vectorization).  */
2539   if (TREE_CODE (op1) == SSA_NAME)
2540     def1 = SSA_NAME_DEF_STMT (op1);
2541 
2542   if (TREE_CODE (op2) == SSA_NAME)
2543     def2 = SSA_NAME_DEF_STMT (op2);
2544 
2545   if (code != COND_EXPR
2546       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2547     {
2548       if (dump_enabled_p ())
2549 	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2550       return NULL;
2551     }
2552 
2553   /* Check that one def is the reduction def, defined by PHI,
2554      the other def is either defined in the loop ("vect_internal_def"),
2555      or it's an induction (defined by a loop-header phi-node).  */
2556 
2557   if (def2 && def2 == phi
2558       && (code == COND_EXPR
2559 	  || !def1 || gimple_nop_p (def1)
2560 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2561           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2562               && (is_gimple_assign (def1)
2563 		  || is_gimple_call (def1)
2564   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565                       == vect_induction_def
2566    	          || (gimple_code (def1) == GIMPLE_PHI
2567 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2568                           == vect_internal_def
2569  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
2570     {
2571       if (dump_enabled_p ())
2572 	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2573       return def_stmt;
2574     }
2575 
2576   if (def1 && def1 == phi
2577       && (code == COND_EXPR
2578 	  || !def2 || gimple_nop_p (def2)
2579 	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2580           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2581  	      && (is_gimple_assign (def2)
2582 		  || is_gimple_call (def2)
2583 	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584                       == vect_induction_def
2585  	          || (gimple_code (def2) == GIMPLE_PHI
2586 		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2587                           == vect_internal_def
2588 		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
2589     {
2590       if (check_reduction)
2591         {
2592           /* Swap operands (just for simplicity - so that the rest of the code
2593 	     can assume that the reduction variable is always the last (second)
2594 	     argument).  */
2595           if (dump_enabled_p ())
2596 	    report_vect_op (MSG_NOTE, def_stmt,
2597 	  	            "detected reduction: need to swap operands: ");
2598 
2599           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2600  			     gimple_assign_rhs2_ptr (def_stmt));
2601 
2602 	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2603 	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2604         }
2605       else
2606         {
2607           if (dump_enabled_p ())
2608             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2609         }
2610 
2611       return def_stmt;
2612     }
2613 
2614   /* Try to find SLP reduction chain.  */
2615   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2616     {
2617       if (dump_enabled_p ())
2618         report_vect_op (MSG_NOTE, def_stmt,
2619 			"reduction: detected reduction chain: ");
2620 
2621       return def_stmt;
2622     }
2623 
2624   if (dump_enabled_p ())
2625     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2626 		    "reduction: unknown pattern: ");
2627 
2628   return NULL;
2629 }
2630 
2631 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2632    in-place.  Arguments as there.  */
2633 
2634 static gimple
2635 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2636                           bool check_reduction, bool *double_reduc)
2637 {
2638   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2639 				     double_reduc, false);
2640 }
2641 
2642 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2643    in-place if it enables detection of more reductions.  Arguments
2644    as there.  */
2645 
2646 gimple
2647 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2648                           bool check_reduction, bool *double_reduc)
2649 {
2650   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2651 				     double_reduc, true);
2652 }
2653 
2654 /* Calculate the cost of one scalar iteration of the loop.  */
2655 int
2656 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2657 				       stmt_vector_for_cost *scalar_cost_vec)
2658 {
2659   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2660   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2661   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2662   int innerloop_iters, i;
2663 
2664   /* Count statements in scalar loop.  Using this as scalar cost for a single
2665      iteration for now.
2666 
2667      TODO: Add outer loop support.
2668 
2669      TODO: Consider assigning different costs to different scalar
2670      statements.  */
2671 
2672   /* FORNOW.  */
2673   innerloop_iters = 1;
2674   if (loop->inner)
2675     innerloop_iters = 50; /* FIXME */
2676 
2677   for (i = 0; i < nbbs; i++)
2678     {
2679       gimple_stmt_iterator si;
2680       basic_block bb = bbs[i];
2681 
2682       if (bb->loop_father == loop->inner)
2683         factor = innerloop_iters;
2684       else
2685         factor = 1;
2686 
2687       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2688         {
2689           gimple stmt = gsi_stmt (si);
2690           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2691 
2692           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2693             continue;
2694 
2695           /* Skip stmts that are not vectorized inside the loop.  */
2696           if (stmt_info
2697               && !STMT_VINFO_RELEVANT_P (stmt_info)
2698               && (!STMT_VINFO_LIVE_P (stmt_info)
2699                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2700 	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2701             continue;
2702 
2703 	  vect_cost_for_stmt kind;
2704           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2705             {
2706               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2707                kind = scalar_load;
2708              else
2709                kind = scalar_store;
2710             }
2711           else
2712             kind = scalar_stmt;
2713 
2714 	  scalar_single_iter_cost
2715 	    += record_stmt_cost (scalar_cost_vec, factor, kind,
2716 				 NULL, 0, vect_prologue);
2717         }
2718     }
2719   return scalar_single_iter_cost;
2720 }
2721 
2722 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
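     /* A hypothetical example of the known-NITERS case below: with 23 known
	iterations, 3 prologue peel iterations and a vectorization factor of 4,
	the epilogue gets (23 - 3) % 4 == 0 peel iterations, unless peeling for
	gaps is required, in which case a full VF (4) iterations are peeled.  */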
2723 int
2724 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2725                              int *peel_iters_epilogue,
2726                              stmt_vector_for_cost *scalar_cost_vec,
2727 			     stmt_vector_for_cost *prologue_cost_vec,
2728 			     stmt_vector_for_cost *epilogue_cost_vec)
2729 {
2730   int retval = 0;
2731   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2732 
2733   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2734     {
2735       *peel_iters_epilogue = vf/2;
2736       if (dump_enabled_p ())
2737         dump_printf_loc (MSG_NOTE, vect_location,
2738 			 "cost model: epilogue peel iters set to vf/2 "
2739 			 "because loop iterations are unknown.\n");
2740 
2741       /* If peeled iterations are known but the number of scalar loop
2742          iterations is unknown, count a taken branch per peeled loop.  */
2743       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2744 				 NULL, 0, vect_prologue);
2745       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2746 				 NULL, 0, vect_epilogue);
2747     }
2748   else
2749     {
2750       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2751       peel_iters_prologue = niters < peel_iters_prologue ?
2752                             niters : peel_iters_prologue;
2753       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2754       /* If we need to peel for gaps, but no epilogue peeling would otherwise
2755 	 be required, we have to peel VF iterations.  */
2756       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2757         *peel_iters_epilogue = vf;
2758     }
2759 
2760   stmt_info_for_cost *si;
2761   int j;
2762   if (peel_iters_prologue)
2763     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2764       retval += record_stmt_cost (prologue_cost_vec,
2765 				  si->count * peel_iters_prologue,
2766 				  si->kind, NULL, si->misalign,
2767 				  vect_prologue);
2768   if (*peel_iters_epilogue)
2769     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2770       retval += record_stmt_cost (epilogue_cost_vec,
2771 				  si->count * *peel_iters_epilogue,
2772 				  si->kind, NULL, si->misalign,
2773 				  vect_epilogue);
2774 
2775   return retval;
2776 }
2777 
2778 /* Function vect_estimate_min_profitable_iters
2779 
2780    Return the number of iterations required for the vector version of the
2781    loop to be profitable relative to the cost of the scalar version of the
2782    loop.  */
2783 
2784 static void
2785 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2786 				    int *ret_min_profitable_niters,
2787 				    int *ret_min_profitable_estimate)
2788 {
2789   int min_profitable_iters;
2790   int min_profitable_estimate;
2791   int peel_iters_prologue;
2792   int peel_iters_epilogue;
2793   unsigned vec_inside_cost = 0;
2794   int vec_outside_cost = 0;
2795   unsigned vec_prologue_cost = 0;
2796   unsigned vec_epilogue_cost = 0;
2797   int scalar_single_iter_cost = 0;
2798   int scalar_outside_cost = 0;
2799   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2801   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2802 
2803   /* Cost model disabled.  */
2804   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2805     {
2806       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2807       *ret_min_profitable_niters = 0;
2808       *ret_min_profitable_estimate = 0;
2809       return;
2810     }
2811 
2812   /* Requires loop versioning tests to handle misalignment.  */
2813   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2814     {
2815       /*  FIXME: Make cost depend on complexity of individual check.  */
2816       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2817       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2818 			    vect_prologue);
2819       dump_printf (MSG_NOTE,
2820                    "cost model: Adding cost of checks for loop "
2821                    "versioning to treat misalignment.\n");
2822     }
2823 
2824   /* Requires loop versioning with alias checks.  */
2825   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2826     {
2827       /*  FIXME: Make cost depend on complexity of individual check.  */
2828       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
2829       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2830 			    vect_prologue);
2831       dump_printf (MSG_NOTE,
2832                    "cost model: Adding cost of checks for loop "
2833                    "versioning aliasing.\n");
2834     }
2835 
2836   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2837       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2838     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2839 			  vect_prologue);
2840 
2841   /* Count statements in scalar loop.  Using this as scalar cost for a single
2842      iteration for now.
2843 
2844      TODO: Add outer loop support.
2845 
2846      TODO: Consider assigning different costs to different scalar
2847      statements.  */
2848 
2849   auto_vec<stmt_info_for_cost> scalar_cost_vec;
2850   scalar_single_iter_cost
2851      = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2852 
2853   /* Add additional cost for the peeled instructions in prologue and epilogue
2854      loop.
2855 
2856      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2857      at compile time, we assume it's vf/2 (the worst case would be vf-1).
2858 
2859      TODO: Build an expression that represents peel_iters for prologue and
2860      epilogue to be used in a run-time test.  */
2861 
2862   if (npeel  < 0)
2863     {
2864       peel_iters_prologue = vf/2;
2865       dump_printf (MSG_NOTE, "cost model: "
2866                    "prologue peel iters set to vf/2.\n");
2867 
2868       /* If peeling for alignment is unknown, loop bound of main loop becomes
2869          unknown.  */
2870       peel_iters_epilogue = vf/2;
2871       dump_printf (MSG_NOTE, "cost model: "
2872                    "epilogue peel iters set to vf/2 because "
2873                    "peeling for alignment is unknown.\n");
2874 
2875       /* If peeled iterations are unknown, count a taken branch and a not taken
2876          branch per peeled loop. Even if scalar loop iterations are known,
2877          vector iterations are not known since peeled prologue iterations are
2878          not known. Hence guards remain the same.  */
2879       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2880 			    NULL, 0, vect_prologue);
2881       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2882 			    NULL, 0, vect_prologue);
2883       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2884 			    NULL, 0, vect_epilogue);
2885       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2886 			    NULL, 0, vect_epilogue);
2887       stmt_info_for_cost *si;
2888       int j;
2889       FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2890 	{
2891 	  struct _stmt_vec_info *stmt_info
2892 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2893 	  (void) add_stmt_cost (target_cost_data,
2894 				si->count * peel_iters_prologue,
2895 				si->kind, stmt_info, si->misalign,
2896 				vect_prologue);
2897 	  (void) add_stmt_cost (target_cost_data,
2898 				si->count * peel_iters_epilogue,
2899 				si->kind, stmt_info, si->misalign,
2900 				vect_epilogue);
2901 	}
2902     }
2903   else
2904     {
2905       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2906       stmt_info_for_cost *si;
2907       int j;
2908       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2909 
2910       prologue_cost_vec.create (2);
2911       epilogue_cost_vec.create (2);
2912       peel_iters_prologue = npeel;
2913 
2914       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2915 					  &peel_iters_epilogue,
2916 					  &scalar_cost_vec,
2917 					  &prologue_cost_vec,
2918 					  &epilogue_cost_vec);
2919 
2920       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2921 	{
2922 	  struct _stmt_vec_info *stmt_info
2923 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2924 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2925 				si->misalign, vect_prologue);
2926 	}
2927 
2928       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2929 	{
2930 	  struct _stmt_vec_info *stmt_info
2931 	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2932 	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2933 				si->misalign, vect_epilogue);
2934 	}
2935 
2936       prologue_cost_vec.release ();
2937       epilogue_cost_vec.release ();
2938     }
2939 
2940   /* FORNOW: The scalar outside cost is incremented in one of the
2941      following ways:
2942 
2943      1. The vectorizer checks for alignment and aliasing and generates
2944      a condition that allows dynamic vectorization.  A cost model
2945      check is ANDED with the versioning condition.  Hence scalar code
2946      path now has the added cost of the versioning check.
2947 
2948        if (cost > th & versioning_check)
2949          jmp to vector code
2950 
2951      Hence run-time scalar is incremented by not-taken branch cost.
2952 
2953      2. The vectorizer then checks if a prologue is required.  If the
2954      cost model check was not done before during versioning, it has to
2955      be done before the prologue check.
2956 
2957        if (cost <= th)
2958          prologue = scalar_iters
2959        if (prologue == 0)
2960          jmp to vector code
2961        else
2962          execute prologue
2963        if (prologue == num_iters)
2964 	 go to exit
2965 
2966      Hence the run-time scalar cost is incremented by a taken branch,
2967      plus a not-taken branch, plus a taken branch cost.
2968 
2969      3. The vectorizer then checks if an epilogue is required.  If the
2970      cost model check was not done before during prologue check, it
2971      has to be done with the epilogue check.
2972 
2973        if (prologue == 0)
2974          jmp to vector code
2975        else
2976          execute prologue
2977        if (prologue == num_iters)
2978 	 go to exit
2979        vector code:
2980          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2981            jmp to epilogue
2982 
2983      Hence the run-time scalar cost should be incremented by 2 taken
2984      branches.
2985 
2986      TODO: The back end may reorder the BBS's differently and reverse
2987      conditions/branch directions.  Change the estimates below to
2988      something more reasonable.  */
2989 
2990   /* If the number of iterations is known and we do not do versioning, we can
2991      decide whether to vectorize at compile time.  Hence the scalar version
2992      does not carry cost model guard costs.  */
2993   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2994       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2995       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2996     {
2997       /* Cost model check occurs at versioning.  */
2998       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2999           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3000 	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3001       else
3002 	{
3003 	  /* Cost model check occurs at prologue generation.  */
3004 	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3005 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3006 	      + vect_get_stmt_cost (cond_branch_not_taken);
3007 	  /* Cost model check occurs at epilogue generation.  */
3008 	  else
3009 	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3010 	}
3011     }
3012 
3013   /* Complete the target-specific cost calculations.  */
3014   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3015 	       &vec_inside_cost, &vec_epilogue_cost);
3016 
3017   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3018 
3019   if (dump_enabled_p ())
3020     {
3021       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3022       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3023                    vec_inside_cost);
3024       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3025                    vec_prologue_cost);
3026       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3027                    vec_epilogue_cost);
3028       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3029                    scalar_single_iter_cost);
3030       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3031                    scalar_outside_cost);
3032       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3033                    vec_outside_cost);
3034       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3035                    peel_iters_prologue);
3036       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3037                    peel_iters_epilogue);
3038     }
3039 
3040   /* Calculate number of iterations required to make the vector version
3041      profitable, relative to the loop bodies only.  The following condition
3042      must hold true:
3043      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3044      where
3045      SIC = scalar iteration cost, VIC = vector iteration cost,
3046      VOC = vector outside cost, VF = vectorization factor,
3047      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3048      SOC = scalar outside cost for run time cost model check.  */
3049 
3050   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3051     {
3052       if (vec_outside_cost <= 0)
3053         min_profitable_iters = 1;
3054       else
3055         {
3056           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3057 				  - vec_inside_cost * peel_iters_prologue
3058                                   - vec_inside_cost * peel_iters_epilogue)
3059                                  / ((scalar_single_iter_cost * vf)
3060                                     - vec_inside_cost);
3061 
3062           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3063               <= (((int) vec_inside_cost * min_profitable_iters)
3064                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3065             min_profitable_iters++;
3066         }
3067     }
3068   /* vector version will never be profitable.  */
3069   else
3070     {
3071       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3072 	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3073 		    "did not happen for a simd loop");
3074 
3075       if (dump_enabled_p ())
3076         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3077 			 "cost model: the vector iteration cost = %d "
3078 			 "divided by the scalar iteration cost = %d "
3079 			 "is greater or equal to the vectorization factor = %d"
3080                          ".\n",
3081 			 vec_inside_cost, scalar_single_iter_cost, vf);
3082       *ret_min_profitable_niters = -1;
3083       *ret_min_profitable_estimate = -1;
3084       return;
3085     }
3086 
3087   dump_printf (MSG_NOTE,
3088 	       "  Calculated minimum iters for profitability: %d\n",
3089 	       min_profitable_iters);
3090 
3091   min_profitable_iters =
3092 	min_profitable_iters < vf ? vf : min_profitable_iters;
3093 
3094   /* Because the condition we create is:
3095      if (niters <= min_profitable_iters)
3096        then skip the vectorized loop.  */
3097   min_profitable_iters--;
3098 
3099   if (dump_enabled_p ())
3100     dump_printf_loc (MSG_NOTE, vect_location,
3101                      "  Runtime profitability threshold = %d\n",
3102                      min_profitable_iters);
3103 
3104   *ret_min_profitable_niters = min_profitable_iters;
3105 
3106   /* Calculate number of iterations required to make the vector version
3107      profitable, relative to the loop bodies only.
3108 
3109      The non-vectorized variant costs SIC * niters and it must win over the
3110      vector variant for the expected loop trip count.  The following condition must hold true:
3111      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3112 
3113   if (vec_outside_cost <= 0)
3114     min_profitable_estimate = 1;
3115   else
3116     {
3117       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3118 				 - vec_inside_cost * peel_iters_prologue
3119 				 - vec_inside_cost * peel_iters_epilogue)
3120 				 / ((scalar_single_iter_cost * vf)
3121 				   - vec_inside_cost);
3122     }
3123   min_profitable_estimate--;
3124   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3125   if (dump_enabled_p ())
3126     dump_printf_loc (MSG_NOTE, vect_location,
3127                      "  Static estimate profitability threshold = %d\n",
3128                      min_profitable_estimate);
3129 
3130   *ret_min_profitable_estimate = min_profitable_estimate;
3131 }
3132 
3133 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3134    vector elements (not bits) for a vector of mode MODE.  */
3135 static void
3136 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3137 			      unsigned char *sel)
3138 {
3139   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3140 
3141   for (i = 0; i < nelt; i++)
3142     sel[i] = (i + offset) & (2*nelt - 1);
3143 }
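
/* For example (illustrative values), for an 8-element vector mode and
   OFFSET 2 the selector becomes { 2, 3, 4, 5, 6, 7, 8, 9 }: indices 8 and 9
   select from the second VEC_PERM_EXPR operand, which the reduction epilogue
   below passes as a zero vector, so the permutation behaves like a
   whole-vector shift by two elements with zeros shifted in.  */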
3144 
3145 /* Checks whether the target supports whole-vector shifts for vectors of mode
3146    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3147    it supports vec_perm_const with masks for all necessary shift amounts.  */
3148 static bool
3149 have_whole_vector_shift (enum machine_mode mode)
3150 {
3151   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3152     return true;
3153 
3154   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3155     return false;
3156 
3157   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3158   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3159 
3160   for (i = nelt/2; i >= 1; i/=2)
3161     {
3162       calc_vec_perm_mask_for_shift (mode, i, sel);
3163       if (!can_vec_perm_p (mode, false, sel))
3164 	return false;
3165     }
3166   return true;
3167 }
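
/* E.g. for an 8-element mode the loop above checks the permutations for
   shifts of 4, 2 and 1 elements, which are exactly the shift amounts the
   log2-style reduction in vect_create_epilog_for_reduction will ask for.  */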
3168 
3169 /* TODO: There is a close dependency between the vect_model_*_cost and
3170    vectorizable_* functions.  Redesign them to avoid maintenance issues.  */
3171 
3172 /* Function vect_model_reduction_cost.
3173 
3174    Models cost for a reduction operation, including the vector ops
3175    generated within the strip-mine loop, the initial definition before
3176    the loop, and the epilogue code that must be generated.  */
3177 
3178 static bool
3179 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3180 			   int ncopies)
3181 {
3182   int prologue_cost = 0, epilogue_cost = 0;
3183   enum tree_code code;
3184   optab optab;
3185   tree vectype;
3186   gimple stmt, orig_stmt;
3187   tree reduction_op;
3188   machine_mode mode;
3189   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3190   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3191   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3192 
3193   /* Cost of reduction op inside loop.  */
3194   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3195 					stmt_info, 0, vect_body);
3196   stmt = STMT_VINFO_STMT (stmt_info);
3197 
3198   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3199     {
3200     case GIMPLE_SINGLE_RHS:
3201       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3202       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3203       break;
3204     case GIMPLE_UNARY_RHS:
3205       reduction_op = gimple_assign_rhs1 (stmt);
3206       break;
3207     case GIMPLE_BINARY_RHS:
3208       reduction_op = gimple_assign_rhs2 (stmt);
3209       break;
3210     case GIMPLE_TERNARY_RHS:
3211       reduction_op = gimple_assign_rhs3 (stmt);
3212       break;
3213     default:
3214       gcc_unreachable ();
3215     }
3216 
3217   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3218   if (!vectype)
3219     {
3220       if (dump_enabled_p ())
3221         {
3222 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223 			   "unsupported data-type ");
3224           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3225 			     TREE_TYPE (reduction_op));
3226           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3227         }
3228       return false;
3229     }
3230 
3231   mode = TYPE_MODE (vectype);
3232   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3233 
3234   if (!orig_stmt)
3235     orig_stmt = STMT_VINFO_STMT (stmt_info);
3236 
3237   code = gimple_assign_rhs_code (orig_stmt);
3238 
3239   /* Add in cost for initial definition.  */
3240   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3241 				  stmt_info, 0, vect_prologue);
3242 
3243   /* Determine cost of epilogue code.
3244 
3245      We have a reduction operator that will reduce the vector in one statement.
3246      Also requires scalar extract.  */
3247 
3248   if (!nested_in_vect_loop_p (loop, orig_stmt))
3249     {
3250       if (reduc_code != ERROR_MARK)
3251 	{
3252 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3253 					  stmt_info, 0, vect_epilogue);
3254 	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3255 					  stmt_info, 0, vect_epilogue);
3256 	}
3257       else
3258 	{
3259 	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3260 	  tree bitsize =
3261 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3262 	  int element_bitsize = tree_to_uhwi (bitsize);
3263 	  int nelements = vec_size_in_bits / element_bitsize;
3264 
3265 	  optab = optab_for_tree_code (code, vectype, optab_default);
3266 
3267 	  /* We have a whole vector shift available.  */
3268 	  if (VECTOR_MODE_P (mode)
3269 	      && optab_handler (optab, mode) != CODE_FOR_nothing
3270 	      && have_whole_vector_shift (mode))
3271 	    {
3272 	      /* Final reduction via vector shifts and the reduction operator.
3273 		 Also requires scalar extract.  */
3274 	      epilogue_cost += add_stmt_cost (target_cost_data,
3275 					      exact_log2 (nelements) * 2,
3276 					      vector_stmt, stmt_info, 0,
3277 					      vect_epilogue);
3278 	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3279 					      vec_to_scalar, stmt_info, 0,
3280 					      vect_epilogue);
3281 	    }
3282 	  else
3283 	    /* Use extracts and reduction op for final reduction.  For N
3284 	       elements, we have N extracts and N-1 reduction ops.  */
3285 	    epilogue_cost += add_stmt_cost (target_cost_data,
3286 					    nelements + nelements - 1,
3287 					    vector_stmt, stmt_info, 0,
3288 					    vect_epilogue);
3289 	}
3290     }
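
  /* Illustrative comparison of the two epilogue schemes above (not computed
     here): for an 8-element vector the shift-based scheme costs
     2 * log2 (8) = 6 vector stmts plus one vec_to_scalar, whereas the
     extract-based fallback costs 8 + 7 = 15 vector stmts.  */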
3291 
3292   if (dump_enabled_p ())
3293     dump_printf (MSG_NOTE,
3294                  "vect_model_reduction_cost: inside_cost = %d, "
3295                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3296                  prologue_cost, epilogue_cost);
3297 
3298   return true;
3299 }
3300 
3301 
3302 /* Function vect_model_induction_cost.
3303 
3304    Models cost for induction operations.  */
3305 
3306 static void
3307 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3308 {
3309   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3310   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3311   unsigned inside_cost, prologue_cost;
3312 
3313   /* Loop cost for vec_loop.  */
3314   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3315 			       stmt_info, 0, vect_body);
3316 
3317   /* Prologue cost for vec_init and vec_step.  */
3318   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3319 				 stmt_info, 0, vect_prologue);
3320 
3321   if (dump_enabled_p ())
3322     dump_printf_loc (MSG_NOTE, vect_location,
3323                      "vect_model_induction_cost: inside_cost = %d, "
3324                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3325 }
3326 
3327 
3328 /* Function get_initial_def_for_induction
3329 
3330    Input:
3331    STMT - a stmt that performs an induction operation in the loop.
3332    IV_PHI - the initial value of the induction variable
3333 
3334    Output:
3335    Return a vector variable, initialized with the first VF values of
3336    the induction variable.  E.g., for an iv with IV_PHI='X' and
3337    evolution S, for a vector of 4 units, we want to return:
3338    [X, X + S, X + 2*S, X + 3*S].  */
3339 
3340 static tree
3341 get_initial_def_for_induction (gimple iv_phi)
3342 {
3343   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3344   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3345   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3346   tree vectype;
3347   int nunits;
3348   edge pe = loop_preheader_edge (loop);
3349   struct loop *iv_loop;
3350   basic_block new_bb;
3351   tree new_vec, vec_init, vec_step, t;
3352   tree new_var;
3353   tree new_name;
3354   gimple init_stmt, new_stmt;
3355   gphi *induction_phi;
3356   tree induc_def, vec_def, vec_dest;
3357   tree init_expr, step_expr;
3358   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3359   int i;
3360   int ncopies;
3361   tree expr;
3362   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3363   bool nested_in_vect_loop = false;
3364   gimple_seq stmts = NULL;
3365   imm_use_iterator imm_iter;
3366   use_operand_p use_p;
3367   gimple exit_phi;
3368   edge latch_e;
3369   tree loop_arg;
3370   gimple_stmt_iterator si;
3371   basic_block bb = gimple_bb (iv_phi);
3372   tree stepvectype;
3373   tree resvectype;
3374 
3375   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3376   if (nested_in_vect_loop_p (loop, iv_phi))
3377     {
3378       nested_in_vect_loop = true;
3379       iv_loop = loop->inner;
3380     }
3381   else
3382     iv_loop = loop;
3383   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3384 
3385   latch_e = loop_latch_edge (iv_loop);
3386   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3387 
3388   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3389   gcc_assert (step_expr != NULL_TREE);
3390 
3391   pe = loop_preheader_edge (iv_loop);
3392   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3393 				     loop_preheader_edge (iv_loop));
3394 
3395   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3396   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3397   gcc_assert (vectype);
3398   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3399   ncopies = vf / nunits;
3400 
3401   gcc_assert (phi_info);
3402   gcc_assert (ncopies >= 1);
3403 
3404   /* Convert the step to the desired type.  */
3405   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3406 						  step_expr),
3407 				    &stmts, true, NULL_TREE);
3408   if (stmts)
3409     {
3410       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3411       gcc_assert (!new_bb);
3412     }
3413 
3414   /* Find the first insertion point in the BB.  */
3415   si = gsi_after_labels (bb);
3416 
3417   /* Create the vector that holds the initial_value of the induction.  */
3418   if (nested_in_vect_loop)
3419     {
3420       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3421 	 been created during vectorization of previous stmts.  We obtain it
3422 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3423       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3424       /* If the initial value is not of proper type, convert it.  */
3425       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3426 	{
3427 	  new_stmt
3428 	    = gimple_build_assign (vect_get_new_vect_var (vectype,
3429 							  vect_simple_var,
3430 							  "vec_iv_"),
3431 				   VIEW_CONVERT_EXPR,
3432 				   build1 (VIEW_CONVERT_EXPR, vectype,
3433 					   vec_init));
3434 	  vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3435 	  gimple_assign_set_lhs (new_stmt, vec_init);
3436 	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3437 						 new_stmt);
3438 	  gcc_assert (!new_bb);
3439 	  set_vinfo_for_stmt (new_stmt,
3440 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3441 	}
3442     }
3443   else
3444     {
3445       vec<constructor_elt, va_gc> *v;
3446 
3447       /* iv_loop is the loop to be vectorized. Create:
3448 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3449       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3450 				       vect_scalar_var, "var_");
3451       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3452 						     init_expr),
3453 				       &stmts, false, new_var);
3454       if (stmts)
3455 	{
3456 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3457 	  gcc_assert (!new_bb);
3458 	}
3459 
3460       vec_alloc (v, nunits);
3461       bool constant_p = is_gimple_min_invariant (new_name);
3462       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3463       for (i = 1; i < nunits; i++)
3464 	{
3465 	  /* Create: new_name_i = new_name + step_expr  */
3466 	  new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3467 				  new_name, step_expr);
3468 	  if (!is_gimple_min_invariant (new_name))
3469 	    {
3470 	      init_stmt = gimple_build_assign (new_var, new_name);
3471 	      new_name = make_ssa_name (new_var, init_stmt);
3472 	      gimple_assign_set_lhs (init_stmt, new_name);
3473 	      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3474 	      gcc_assert (!new_bb);
3475 	      if (dump_enabled_p ())
3476 		{
3477 		  dump_printf_loc (MSG_NOTE, vect_location,
3478 				   "created new init_stmt: ");
3479 		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3480                   dump_printf (MSG_NOTE, "\n");
3481 		}
3482 	      constant_p = false;
3483 	    }
3484 	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3485 	}
3486       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3487       if (constant_p)
3488 	new_vec = build_vector_from_ctor (vectype, v);
3489       else
3490 	new_vec = build_constructor (vectype, v);
3491       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3492     }
3493 
3494 
3495   /* Create the vector that holds the step of the induction.  */
3496   if (nested_in_vect_loop)
3497     /* iv_loop is nested in the loop to be vectorized. Generate:
3498        vec_step = [S, S, S, S]  */
3499     new_name = step_expr;
3500   else
3501     {
3502       /* iv_loop is the loop to be vectorized. Generate:
3503 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3504       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3505 	{
3506 	  expr = build_int_cst (integer_type_node, vf);
3507 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3508 	}
3509       else
3510 	expr = build_int_cst (TREE_TYPE (step_expr), vf);
3511       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3512 			      expr, step_expr);
3513       if (TREE_CODE (step_expr) == SSA_NAME)
3514 	new_name = vect_init_vector (iv_phi, new_name,
3515 				     TREE_TYPE (step_expr), NULL);
3516     }
3517 
3518   t = unshare_expr (new_name);
3519   gcc_assert (CONSTANT_CLASS_P (new_name)
3520 	      || TREE_CODE (new_name) == SSA_NAME);
3521   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3522   gcc_assert (stepvectype);
3523   new_vec = build_vector_from_val (stepvectype, t);
3524   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3525 
3526 
3527   /* Create the following def-use cycle:
3528      loop prolog:
3529          vec_init = ...
3530 	 vec_step = ...
3531      loop:
3532          vec_iv = PHI <vec_init, vec_loop>
3533          ...
3534          STMT
3535          ...
3536          vec_loop = vec_iv + vec_step;  */
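
  /* For instance (assuming VF = 4, scalar step S and initial value X, as in
     the examples above), vec_init is [X, X+S, X+2*S, X+3*S] and vec_step is
     [4*S, 4*S, 4*S, 4*S], so every iteration of the vector loop advances all
     four lanes by one full vector's worth of the scalar induction.  */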
3537 
3538   /* Create the induction-phi that defines the induction-operand.  */
3539   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3540   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3541   set_vinfo_for_stmt (induction_phi,
3542 		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3543   induc_def = PHI_RESULT (induction_phi);
3544 
3545   /* Create the iv update inside the loop  */
3546   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3547   vec_def = make_ssa_name (vec_dest, new_stmt);
3548   gimple_assign_set_lhs (new_stmt, vec_def);
3549   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3550   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3551                                                    NULL));
3552 
3553   /* Set the arguments of the phi node:  */
3554   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3555   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3556 	       UNKNOWN_LOCATION);
3557 
3558 
3559   /* In case the vectorization factor (VF) is bigger than the number
3560      of elements that we can fit in a vectype (nunits), we have to generate
3561      more than one vector stmt - i.e., we need to "unroll" the
3562      vector stmt by a factor VF/nunits.  For more details see the
3563      documentation in vectorizable_operation.  */
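
  /* A worked example (illustrative only): with VF = 8 and nunits = 4 we get
     ncopies = 2; the first vector holds [X, X+S, X+2*S, X+3*S] and the copy
     created below adds [4*S, 4*S, 4*S, 4*S] to it, giving
     [X+4*S, X+5*S, X+6*S, X+7*S] for the remaining four lanes.  */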
3564 
3565   if (ncopies > 1)
3566     {
3567       stmt_vec_info prev_stmt_vinfo;
3568       /* FORNOW. This restriction should be relaxed.  */
3569       gcc_assert (!nested_in_vect_loop);
3570 
3571       /* Create the vector that holds the step of the induction.  */
3572       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3573 	{
3574 	  expr = build_int_cst (integer_type_node, nunits);
3575 	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3576 	}
3577       else
3578 	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3579       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3580 			      expr, step_expr);
3581       if (TREE_CODE (step_expr) == SSA_NAME)
3582 	new_name = vect_init_vector (iv_phi, new_name,
3583 				     TREE_TYPE (step_expr), NULL);
3584       t = unshare_expr (new_name);
3585       gcc_assert (CONSTANT_CLASS_P (new_name)
3586 		  || TREE_CODE (new_name) == SSA_NAME);
3587       new_vec = build_vector_from_val (stepvectype, t);
3588       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3589 
3590       vec_def = induc_def;
3591       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3592       for (i = 1; i < ncopies; i++)
3593 	{
3594 	  /* vec_i = vec_prev + vec_step  */
3595 	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3596 					  vec_def, vec_step);
3597 	  vec_def = make_ssa_name (vec_dest, new_stmt);
3598 	  gimple_assign_set_lhs (new_stmt, vec_def);
3599 
3600 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3601 	  if (!useless_type_conversion_p (resvectype, vectype))
3602 	    {
3603 	      new_stmt
3604 		= gimple_build_assign
3605 			(vect_get_new_vect_var (resvectype, vect_simple_var,
3606 						"vec_iv_"),
3607 			 VIEW_CONVERT_EXPR,
3608 			 build1 (VIEW_CONVERT_EXPR, resvectype,
3609 				 gimple_assign_lhs (new_stmt)));
3610 	      gimple_assign_set_lhs (new_stmt,
3611 				     make_ssa_name
3612 				       (gimple_assign_lhs (new_stmt), new_stmt));
3613 	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3614 	    }
3615 	  set_vinfo_for_stmt (new_stmt,
3616 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3617 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3618 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3619 	}
3620     }
3621 
3622   if (nested_in_vect_loop)
3623     {
3624       /* Find the loop-closed exit-phi of the induction, and record
3625          the final vector of induction results:  */
3626       exit_phi = NULL;
3627       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3628         {
3629 	  gimple use_stmt = USE_STMT (use_p);
3630 	  if (is_gimple_debug (use_stmt))
3631 	    continue;
3632 
3633 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3634 	    {
3635 	      exit_phi = use_stmt;
3636 	      break;
3637 	    }
3638         }
3639       if (exit_phi)
3640 	{
3641 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3642 	  /* FORNOW. Currently not supporting the case that an inner-loop induction
3643 	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
3644 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3645 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
3646 
3647 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3648 	  if (dump_enabled_p ())
3649 	    {
3650 	      dump_printf_loc (MSG_NOTE, vect_location,
3651 			       "vector of inductions after inner-loop:");
3652 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3653               dump_printf (MSG_NOTE, "\n");
3654 	    }
3655 	}
3656     }
3657 
3658 
3659   if (dump_enabled_p ())
3660     {
3661       dump_printf_loc (MSG_NOTE, vect_location,
3662 		       "transform induction: created def-use cycle: ");
3663       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3664       dump_printf (MSG_NOTE, "\n");
3665       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3666 			SSA_NAME_DEF_STMT (vec_def), 0);
3667       dump_printf (MSG_NOTE, "\n");
3668     }
3669 
3670   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3671   if (!useless_type_conversion_p (resvectype, vectype))
3672     {
3673       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3674 							     vect_simple_var,
3675 							     "vec_iv_"),
3676 				      VIEW_CONVERT_EXPR,
3677 				      build1 (VIEW_CONVERT_EXPR, resvectype,
3678 					      induc_def));
3679       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3680       gimple_assign_set_lhs (new_stmt, induc_def);
3681       si = gsi_after_labels (bb);
3682       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3683       set_vinfo_for_stmt (new_stmt,
3684 			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3685       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3686 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3687     }
3688 
3689   return induc_def;
3690 }
3691 
3692 
3693 /* Function get_initial_def_for_reduction
3694 
3695    Input:
3696    STMT - a stmt that performs a reduction operation in the loop.
3697    INIT_VAL - the initial value of the reduction variable
3698 
3699    Output:
3700    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3701         of the reduction (used for adjusting the epilog - see below).
3702    Return a vector variable, initialized according to the operation that STMT
3703         performs. This vector will be used as the initial value of the
3704         vector of partial results.
3705 
3706    Option1 (adjust in epilog): Initialize the vector as follows:
3707      add/bit or/xor:    [0,0,...,0,0]
3708      mult/bit and:      [1,1,...,1,1]
3709      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3710    and when necessary (e.g. add/mult case) let the caller know
3711    that it needs to adjust the result by init_val.
3712 
3713    Option2: Initialize the vector as follows:
3714      add/bit or/xor:    [init_val,0,0,...,0]
3715      mult/bit and:      [init_val,1,1,...,1]
3716      min/max/cond_expr: [init_val,init_val,...,init_val]
3717    and no adjustments are needed.
3718 
3719    For example, for the following code:
3720 
3721    s = init_val;
3722    for (i=0;i<n;i++)
3723      s = s + a[i];
3724 
3725    STMT is 's = s + a[i]', and the reduction variable is 's'.
3726    For a vector of 4 units, we want to return either [0,0,0,init_val],
3727    or [0,0,0,0] and let the caller know that it needs to adjust
3728    the result at the end by 'init_val'.
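
   Similarly (an illustrative case, not derived from the code below), for a
   product reduction s = s * a[i] with init_val = 5 and a vector of 4 units,
   Option1 returns [1,1,1,1] and asks the caller to multiply the final result
   by 5, whereas Option2 returns [5,1,1,1] and needs no adjustment.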
3729 
3730    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3731    is not NULL, because its initialization vector is simpler (the same element
3732    in all entries), and Option2 otherwise.
3733 
3734    A cost model should help decide between these two schemes.  */
3735 
3736 tree
3737 get_initial_def_for_reduction (gimple stmt, tree init_val,
3738                                tree *adjustment_def)
3739 {
3740   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3741   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3742   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3743   tree scalar_type = TREE_TYPE (init_val);
3744   tree vectype = get_vectype_for_scalar_type (scalar_type);
3745   int nunits;
3746   enum tree_code code = gimple_assign_rhs_code (stmt);
3747   tree def_for_init;
3748   tree init_def;
3749   tree *elts;
3750   int i;
3751   bool nested_in_vect_loop = false;
3752   tree init_value;
3753   REAL_VALUE_TYPE real_init_val = dconst0;
3754   int int_init_val = 0;
3755   gimple def_stmt = NULL;
3756 
3757   gcc_assert (vectype);
3758   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3759 
3760   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3761 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
3762 
3763   if (nested_in_vect_loop_p (loop, stmt))
3764     nested_in_vect_loop = true;
3765   else
3766     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3767 
3768   /* In case of double reduction we only create a vector variable to be put
3769      in the reduction phi node.  The actual statement creation is done in
3770      vect_create_epilog_for_reduction.  */
3771   if (adjustment_def && nested_in_vect_loop
3772       && TREE_CODE (init_val) == SSA_NAME
3773       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3774       && gimple_code (def_stmt) == GIMPLE_PHI
3775       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3776       && vinfo_for_stmt (def_stmt)
3777       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3778           == vect_double_reduction_def)
3779     {
3780       *adjustment_def = NULL;
3781       return vect_create_destination_var (init_val, vectype);
3782     }
3783 
3784   if (TREE_CONSTANT (init_val))
3785     {
3786       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3787         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3788       else
3789         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3790     }
3791   else
3792     init_value = init_val;
3793 
3794   /* In case of a nested reduction do not use an adjustment def, as
3795      that case is not handled correctly by the epilogue generation
3796      if ncopies is not one.  */
3797   if (adjustment_def && nested_in_vect_loop)
3798     {
3799       *adjustment_def = NULL;
3800       return vect_get_vec_def_for_operand (init_val, stmt, NULL);
3801     }
3802 
3803   switch (code)
3804     {
3805       case WIDEN_SUM_EXPR:
3806       case DOT_PROD_EXPR:
3807       case SAD_EXPR:
3808       case PLUS_EXPR:
3809       case MINUS_EXPR:
3810       case BIT_IOR_EXPR:
3811       case BIT_XOR_EXPR:
3812       case MULT_EXPR:
3813       case BIT_AND_EXPR:
3814         /* ADJUSTMENT_DEF is NULL when called from
3815            vect_create_epilog_for_reduction to vectorize double reduction.  */
3816         if (adjustment_def)
3817 	  *adjustment_def = init_val;
3818 
3819         if (code == MULT_EXPR)
3820           {
3821             real_init_val = dconst1;
3822             int_init_val = 1;
3823           }
3824 
3825         if (code == BIT_AND_EXPR)
3826           int_init_val = -1;
3827 
3828         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3829           def_for_init = build_real (scalar_type, real_init_val);
3830         else
3831           def_for_init = build_int_cst (scalar_type, int_init_val);
3832 
3833         /* Create a vector of '0' or '1' except the first element.  */
3834 	elts = XALLOCAVEC (tree, nunits);
3835         for (i = nunits - 2; i >= 0; --i)
3836 	  elts[i + 1] = def_for_init;
3837 
3838         /* Option1: the first element is '0' or '1' as well.  */
3839         if (adjustment_def)
3840           {
3841 	    elts[0] = def_for_init;
3842             init_def = build_vector (vectype, elts);
3843             break;
3844           }
3845 
3846         /* Option2: the first element is INIT_VAL.  */
3847 	elts[0] = init_val;
3848         if (TREE_CONSTANT (init_val))
3849           init_def = build_vector (vectype, elts);
3850         else
3851 	  {
3852 	    vec<constructor_elt, va_gc> *v;
3853 	    vec_alloc (v, nunits);
3854 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3855 	    for (i = 1; i < nunits; ++i)
3856 	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3857 	    init_def = build_constructor (vectype, v);
3858 	  }
3859 
3860         break;
3861 
3862       case MIN_EXPR:
3863       case MAX_EXPR:
3864       case COND_EXPR:
3865         if (adjustment_def)
3866           {
3867             *adjustment_def = NULL_TREE;
3868             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3869             break;
3870           }
3871 
3872 	init_def = build_vector_from_val (vectype, init_value);
3873         break;
3874 
3875       default:
3876         gcc_unreachable ();
3877     }
3878 
3879   return init_def;
3880 }
3881 
3882 /* Function vect_create_epilog_for_reduction
3883 
3884    Create code at the loop-epilog to finalize the result of a reduction
3885    computation.
3886 
3887    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3888      reduction statements.
3889    STMT is the scalar reduction stmt that is being vectorized.
3890    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3891      number of elements that we can fit in a vectype (nunits).  In this case
3892      we have to generate more than one vector stmt - i.e., we need to "unroll"
3893      the vector stmt by a factor VF/nunits.  For more details see documentation
3894      in vectorizable_operation.
3895    REDUC_CODE is the tree-code for the epilog reduction.
3896    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3897      computation.
3898    REDUC_INDEX is the index of the operand in the right hand side of the
3899      statement that is defined by REDUCTION_PHI.
3900    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3901    SLP_NODE is an SLP node containing a group of reduction statements. The
3902      first one in this group is STMT.
3903 
3904    This function:
3905    1. Creates the reduction def-use cycles: sets the arguments for
3906       REDUCTION_PHIS:
3907       The loop-entry argument is the vectorized initial-value of the reduction.
3908       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3909       sums.
3910    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3911       by applying the operation specified by REDUC_CODE if available, or by
3912       other means (whole-vector shifts or a scalar loop).
3913       The function also creates a new phi node at the loop exit to preserve
3914       loop-closed form, as illustrated below.
3915 
3916      The flow at the entry to this function:
3917 
3918         loop:
3919           vec_def = phi <null, null>            # REDUCTION_PHI
3920           VECT_DEF = vector_stmt                # vectorized form of STMT
3921           s_loop = scalar_stmt                  # (scalar) STMT
3922         loop_exit:
3923           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3924           use <s_out0>
3925           use <s_out0>
3926 
3927      The above is transformed by this function into:
3928 
3929         loop:
3930           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3931           VECT_DEF = vector_stmt                # vectorized form of STMT
3932           s_loop = scalar_stmt                  # (scalar) STMT
3933         loop_exit:
3934           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3935           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3936           v_out2 = reduce <v_out1>
3937           s_out3 = extract_field <v_out2, 0>
3938           s_out4 = adjust_result <s_out3>
3939           use <s_out4>
3940           use <s_out4>
3941 */
3942 
3943 static void
3944 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3945 				  int ncopies, enum tree_code reduc_code,
3946 				  vec<gimple> reduction_phis,
3947                                   int reduc_index, bool double_reduc,
3948                                   slp_tree slp_node)
3949 {
3950   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3951   stmt_vec_info prev_phi_info;
3952   tree vectype;
3953   machine_mode mode;
3954   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3955   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3956   basic_block exit_bb;
3957   tree scalar_dest;
3958   tree scalar_type;
3959   gimple new_phi = NULL, phi;
3960   gimple_stmt_iterator exit_gsi;
3961   tree vec_dest;
3962   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3963   gimple epilog_stmt = NULL;
3964   enum tree_code code = gimple_assign_rhs_code (stmt);
3965   gimple exit_phi;
3966   tree bitsize;
3967   tree adjustment_def = NULL;
3968   tree vec_initial_def = NULL;
3969   tree reduction_op, expr, def;
3970   tree orig_name, scalar_result;
3971   imm_use_iterator imm_iter, phi_imm_iter;
3972   use_operand_p use_p, phi_use_p;
3973   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3974   bool nested_in_vect_loop = false;
3975   auto_vec<gimple> new_phis;
3976   auto_vec<gimple> inner_phis;
3977   enum vect_def_type dt = vect_unknown_def_type;
3978   int j, i;
3979   auto_vec<tree> scalar_results;
3980   unsigned int group_size = 1, k, ratio;
3981   auto_vec<tree> vec_initial_defs;
3982   auto_vec<gimple> phis;
3983   bool slp_reduc = false;
3984   tree new_phi_result;
3985   gimple inner_phi = NULL;
3986 
3987   if (slp_node)
3988     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3989 
3990   if (nested_in_vect_loop_p (loop, stmt))
3991     {
3992       outer_loop = loop;
3993       loop = loop->inner;
3994       nested_in_vect_loop = true;
3995       gcc_assert (!slp_node);
3996     }
3997 
3998   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3999     {
4000     case GIMPLE_SINGLE_RHS:
4001       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
4002 		  == ternary_op);
4003       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
4004       break;
4005     case GIMPLE_UNARY_RHS:
4006       reduction_op = gimple_assign_rhs1 (stmt);
4007       break;
4008     case GIMPLE_BINARY_RHS:
4009       reduction_op = reduc_index ?
4010                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
4011       break;
4012     case GIMPLE_TERNARY_RHS:
4013       reduction_op = gimple_op (stmt, reduc_index + 1);
4014       break;
4015     default:
4016       gcc_unreachable ();
4017     }
4018 
4019   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4020   gcc_assert (vectype);
4021   mode = TYPE_MODE (vectype);
4022 
4023   /* 1. Create the reduction def-use cycle:
4024      Set the arguments of REDUCTION_PHIS, i.e., transform
4025 
4026         loop:
4027           vec_def = phi <null, null>            # REDUCTION_PHI
4028           VECT_DEF = vector_stmt                # vectorized form of STMT
4029           ...
4030 
4031      into:
4032 
4033         loop:
4034           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4035           VECT_DEF = vector_stmt                # vectorized form of STMT
4036           ...
4037 
4038      (in case of SLP, do it for all the phis). */
4039 
4040   /* Get the loop-entry arguments.  */
4041   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4042   if (slp_node)
4043     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4044                        NULL, slp_node, reduc_index);
4045   else
4046     {
4047       /* Get at the scalar def before the loop, that defines the initial value
4048 	 of the reduction variable.  */
4049       gimple def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4050       tree initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4051 						loop_preheader_edge (loop));
4052       vect_is_simple_use (initial_def, NULL, loop_vinfo, NULL,
4053 			  &def_stmt, &initial_def, &initial_def_dt);
4054      /* For the case of reduction, vect_get_vec_def_for_operand returns
4055         the scalar def before the loop, that defines the initial value
4056         of the reduction variable.  */
4057       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4058                                                       &adjustment_def);
4059       vec_initial_defs.create (1);
4060       vec_initial_defs.quick_push (vec_initial_def);
4061     }
4062 
4063   /* Set phi nodes arguments.  */
4064   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4065     {
4066       tree vec_init_def, def;
4067       gimple_seq stmts;
4068       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4069 					   true, NULL_TREE);
4070       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4071       def = vect_defs[i];
4072       for (j = 0; j < ncopies; j++)
4073         {
4074 	  if (j != 0)
4075 	    {
4076 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4077 	      if (nested_in_vect_loop)
4078 		vec_init_def
4079 		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4080 						    vec_init_def);
4081 	    }
4082 
4083           /* Set the loop-entry arg of the reduction-phi.  */
4084           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4085 		       loop_preheader_edge (loop), UNKNOWN_LOCATION);
4086 
4087           /* Set the loop-latch arg for the reduction-phi.  */
4088           if (j > 0)
4089             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4090 
4091           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4092 		       UNKNOWN_LOCATION);
4093 
4094           if (dump_enabled_p ())
4095             {
4096               dump_printf_loc (MSG_NOTE, vect_location,
4097 			       "transform reduction: created def-use cycle: ");
4098               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4099               dump_printf (MSG_NOTE, "\n");
4100               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4101               dump_printf (MSG_NOTE, "\n");
4102             }
4103         }
4104     }
4105 
4106   /* 2. Create epilog code.
4107         The reduction epilog code operates across the elements of the vector
4108         of partial results computed by the vectorized loop.
4109         The reduction epilog code consists of:
4110 
4111         step 1: compute the scalar result in a vector (v_out2)
4112         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4113         step 3: adjust the scalar result (s_out3) if needed.
4114 
4115         Step 1 can be accomplished using one of the following three schemes:
4116           (scheme 1) using reduc_code, if available.
4117           (scheme 2) using whole-vector shifts, if available.
4118           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4119                      combined.
4120 
4121           The overall epilog code looks like this:
4122 
4123           s_out0 = phi <s_loop>         # original EXIT_PHI
4124           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4125           v_out2 = reduce <v_out1>              # step 1
4126           s_out3 = extract_field <v_out2, 0>    # step 2
4127           s_out4 = adjust_result <s_out3>       # step 3
4128 
4129           (step 3 is optional, and steps 1 and 2 may be combined).
4130           Lastly, the uses of s_out0 are replaced by s_out4.  */
4131 
4132 
4133   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4134          v_out1 = phi <VECT_DEF>
4135          Store them in NEW_PHIS.  */
4136 
4137   exit_bb = single_exit (loop)->dest;
4138   prev_phi_info = NULL;
4139   new_phis.create (vect_defs.length ());
4140   FOR_EACH_VEC_ELT (vect_defs, i, def)
4141     {
4142       for (j = 0; j < ncopies; j++)
4143         {
4144 	  tree new_def = copy_ssa_name (def);
4145           phi = create_phi_node (new_def, exit_bb);
4146           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4147           if (j == 0)
4148             new_phis.quick_push (phi);
4149           else
4150 	    {
4151 	      def = vect_get_vec_def_for_stmt_copy (dt, def);
4152 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4153 	    }
4154 
4155           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4156           prev_phi_info = vinfo_for_stmt (phi);
4157         }
4158     }
4159 
4160   /* The epilogue is created for the outer-loop, i.e., for the loop being
4161      vectorized.  Create exit phis for the outer loop.  */
4162   if (double_reduc)
4163     {
4164       loop = outer_loop;
4165       exit_bb = single_exit (loop)->dest;
4166       inner_phis.create (vect_defs.length ());
4167       FOR_EACH_VEC_ELT (new_phis, i, phi)
4168 	{
4169 	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
4170 	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
4171 	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4172 			   PHI_RESULT (phi));
4173 	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4174 							    loop_vinfo, NULL));
4175 	  inner_phis.quick_push (phi);
4176 	  new_phis[i] = outer_phi;
4177 	  prev_phi_info = vinfo_for_stmt (outer_phi);
4178           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4179             {
4180 	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4181 	      new_result = copy_ssa_name (PHI_RESULT (phi));
4182 	      outer_phi = create_phi_node (new_result, exit_bb);
4183 	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4184 			       PHI_RESULT (phi));
4185 	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4186 							loop_vinfo, NULL));
4187 	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4188 	      prev_phi_info = vinfo_for_stmt (outer_phi);
4189 	    }
4190 	}
4191     }
4192 
4193   exit_gsi = gsi_after_labels (exit_bb);
4194 
4195   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4196          (i.e. when reduc_code is not available) and in the final adjustment
4197 	 code (if needed).  Also get the original scalar reduction variable as
4198          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4199          represents a reduction pattern), the tree-code and scalar-def are
4200          taken from the original stmt that the pattern-stmt (STMT) replaces.
4201          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4202          are taken from STMT.  */
4203 
4204   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4205   if (!orig_stmt)
4206     {
4207       /* Regular reduction  */
4208       orig_stmt = stmt;
4209     }
4210   else
4211     {
4212       /* Reduction pattern  */
4213       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4214       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4215       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4216     }
4217 
4218   code = gimple_assign_rhs_code (orig_stmt);
4219   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4220      partial results are added and not subtracted.  */
4221   if (code == MINUS_EXPR)
4222     code = PLUS_EXPR;
4223 
4224   scalar_dest = gimple_assign_lhs (orig_stmt);
4225   scalar_type = TREE_TYPE (scalar_dest);
4226   scalar_results.create (group_size);
4227   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4228   bitsize = TYPE_SIZE (scalar_type);
4229 
4230   /* In case this is a reduction in an inner-loop while vectorizing an outer
4231      loop - we don't need to extract a single scalar result at the end of the
4232      inner-loop (unless it is double reduction, i.e., the use of reduction is
4233      outside the outer-loop).  The final vector of partial results will be used
4234      in the vectorized outer-loop, or reduced to a scalar result at the end of
4235      the outer-loop.  */
4236   if (nested_in_vect_loop && !double_reduc)
4237     goto vect_finalize_reduction;
4238 
4239   /* SLP reduction without reduction chain, e.g.,
4240      # a1 = phi <a2, a0>
4241      # b1 = phi <b2, b0>
4242      a2 = operation (a1)
4243      b2 = operation (b1)  */
4244   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4245 
4246   /* In case of reduction chain, e.g.,
4247      # a1 = phi <a3, a0>
4248      a2 = operation (a1)
4249      a3 = operation (a2),
4250 
4251      we may end up with more than one vector result.  Here we reduce them to
4252      one vector.  */
4253   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4254     {
4255       tree first_vect = PHI_RESULT (new_phis[0]);
4256       tree tmp;
4257       gassign *new_vec_stmt = NULL;
4258 
4259       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4260       for (k = 1; k < new_phis.length (); k++)
4261         {
4262           gimple next_phi = new_phis[k];
4263           tree second_vect = PHI_RESULT (next_phi);
4264 
4265           tmp = build2 (code, vectype, first_vect, second_vect);
4266           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4267           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4268           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4269           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4270         }
4271 
4272       new_phi_result = first_vect;
4273       if (new_vec_stmt)
4274         {
4275           new_phis.truncate (0);
4276           new_phis.safe_push (new_vec_stmt);
4277         }
4278     }
4279   else
4280     new_phi_result = PHI_RESULT (new_phis[0]);
4281 
4282   /* 2.3 Create the reduction code, using one of the three schemes described
4283          above. In SLP we simply need to extract all the elements from the
4284          vector (without reducing them), so we use scalar shifts.  */
4285   if (reduc_code != ERROR_MARK && !slp_reduc)
4286     {
4287       tree tmp;
4288       tree vec_elem_type;
4289 
4290       /*** Case 1:  Create:
4291            v_out2 = reduc_expr <v_out1>  */
4292 
4293       if (dump_enabled_p ())
4294         dump_printf_loc (MSG_NOTE, vect_location,
4295 			 "Reduce using direct vector reduction.\n");
4296 
4297       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4298       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4299 	{
4300           tree tmp_dest =
4301 	      vect_create_destination_var (scalar_dest, vec_elem_type);
4302 	  tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4303 	  epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4304 	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4305 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4306 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4307 
4308 	  tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4309 	}
4310       else
4311 	tmp = build1 (reduc_code, scalar_type, new_phi_result);
4312       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4313       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4314       gimple_assign_set_lhs (epilog_stmt, new_temp);
4315       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4316       scalar_results.safe_push (new_temp);
4317     }
4318   else
4319     {
4320       bool reduce_with_shift = have_whole_vector_shift (mode);
4321       int element_bitsize = tree_to_uhwi (bitsize);
4322       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4323       tree vec_temp;
4324 
4325       /* Regardless of whether we have a whole vector shift, if we're
4326          emulating the operation via tree-vect-generic, we don't want
4327          to use it.  Only the first round of the reduction is likely
4328          to still be profitable via emulation.  */
4329       /* ??? It might be better to emit a reduction tree code here, so that
4330          tree-vect-generic can expand the first round via bit tricks.  */
4331       if (!VECTOR_MODE_P (mode))
4332         reduce_with_shift = false;
4333       else
4334         {
4335           optab optab = optab_for_tree_code (code, vectype, optab_default);
4336           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4337             reduce_with_shift = false;
4338         }
4339 
4340       if (reduce_with_shift && !slp_reduc)
4341         {
4342           int nelements = vec_size_in_bits / element_bitsize;
4343           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4344 
4345           int elt_offset;
4346 
4347           tree zero_vec = build_zero_cst (vectype);
4348           /*** Case 2: Create:
4349              for (offset = nelements/2; offset >= 1; offset/=2)
4350                 {
4351                   Create:  va' = vec_shift <va, offset>
4352                   Create:  va = vop <va, va'>
4353                 }  */
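
          /* An illustrative run (not computed here) for a 4-element vector
             [a0, a1, a2, a3] being summed: the OFFSET 2 step shifts in zeros
             to give [a2, a3, 0, 0] and accumulates [a0+a2, a1+a3, a2, a3];
             the OFFSET 1 step then leaves the full sum a0+a1+a2+a3 in
             element 0, which is extracted below.  */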
4354 
4355           tree rhs;
4356 
4357           if (dump_enabled_p ())
4358             dump_printf_loc (MSG_NOTE, vect_location,
4359 			     "Reduce using vector shifts\n");
4360 
4361           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4362           new_temp = new_phi_result;
4363           for (elt_offset = nelements / 2;
4364                elt_offset >= 1;
4365                elt_offset /= 2)
4366             {
4367               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4368               tree mask = vect_gen_perm_mask_any (vectype, sel);
4369 	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4370 						 new_temp, zero_vec, mask);
4371               new_name = make_ssa_name (vec_dest, epilog_stmt);
4372               gimple_assign_set_lhs (epilog_stmt, new_name);
4373               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4374 
4375 	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4376 						 new_temp);
4377               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4378               gimple_assign_set_lhs (epilog_stmt, new_temp);
4379               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4380             }
4381 
4382 	  /* 2.4  Extract the final scalar result.  Create:
4383 	     s_out3 = extract_field <v_out2, bitpos>  */
4384 
4385 	  if (dump_enabled_p ())
4386 	    dump_printf_loc (MSG_NOTE, vect_location,
4387 			     "extract scalar result\n");
4388 
4389 	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4390 			bitsize, bitsize_zero_node);
4391 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4392 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4393 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4394 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4395 	  scalar_results.safe_push (new_temp);
4396         }
4397       else
4398         {
4399           /*** Case 3: Create:
4400              s = extract_field <v_out2, 0>
4401              for (offset = element_size;
4402                   offset < vector_size;
4403                   offset += element_size)
4404                {
4405                  Create:  s' = extract_field <v_out2, offset>
4406                  Create:  s = op <s, s'>  // For non SLP cases
4407                }  */
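
          /* E.g. for four 32-bit elements this emits BIT_FIELD_REF extracts
             at bit offsets 0, 32, 64 and 96 and, in the non-SLP case, three
             scalar ops combining them.  */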
4408 
4409           if (dump_enabled_p ())
4410             dump_printf_loc (MSG_NOTE, vect_location,
4411 			     "Reduce using scalar code.\n");
4412 
4413           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4414           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4415             {
4416               int bit_offset;
4417               if (gimple_code (new_phi) == GIMPLE_PHI)
4418                 vec_temp = PHI_RESULT (new_phi);
4419               else
4420                 vec_temp = gimple_assign_lhs (new_phi);
4421               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4422                             bitsize_zero_node);
4423               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4424               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4425               gimple_assign_set_lhs (epilog_stmt, new_temp);
4426               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4427 
4428               /* In SLP we don't need to apply the reduction operation, so we
4429                  just collect the s' values in SCALAR_RESULTS.  */
4430               if (slp_reduc)
4431                 scalar_results.safe_push (new_temp);
4432 
4433               for (bit_offset = element_bitsize;
4434                    bit_offset < vec_size_in_bits;
4435                    bit_offset += element_bitsize)
4436                 {
4437                   tree bitpos = bitsize_int (bit_offset);
4438                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4439                                      bitsize, bitpos);
4440 
4441                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4442                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4443                   gimple_assign_set_lhs (epilog_stmt, new_name);
4444                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4445 
4446                   if (slp_reduc)
4447                     {
4448                       /* In SLP we don't need to apply the reduction operation,
4449                          so we just collect the s' values in SCALAR_RESULTS.  */
4450                       new_temp = new_name;
4451                       scalar_results.safe_push (new_name);
4452                     }
4453                   else
4454                     {
4455 		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4456 							 new_name, new_temp);
4457                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4458                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4459                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4460                     }
4461                 }
4462             }
4463 
4464           /* The only case where we need to reduce scalar results in SLP is
4465              unrolling.  If the size of SCALAR_RESULTS is greater than
4466              GROUP_SIZE, we reduce them by combining elements modulo
4467              GROUP_SIZE.  */
4468           if (slp_reduc)
4469             {
4470               tree res, first_res, new_res;
4471               gimple new_stmt;
4472 
4473               /* Reduce multiple scalar results in case of SLP unrolling.  */
4474               for (j = group_size; scalar_results.iterate (j, &res);
4475                    j++)
4476                 {
4477                   first_res = scalar_results[j % group_size];
4478 		  new_stmt = gimple_build_assign (new_scalar_dest, code,
4479 						  first_res, res);
4480                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4481                   gimple_assign_set_lhs (new_stmt, new_res);
4482                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4483                   scalar_results[j % group_size] = new_res;
4484                 }
4485             }
4486           else
4487             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4488             scalar_results.safe_push (new_temp);
4489         }
4490     }
4491 
4492 vect_finalize_reduction:
4493 
4494   if (double_reduc)
4495     loop = loop->inner;
4496 
4497   /* 2.5 Adjust the final result by the initial value of the reduction
4498 	 variable. (When such adjustment is not needed, then
4499 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
4500 	 new_temp = loop_exit_def + adjustment_def  */
4501 
4502   if (adjustment_def)
4503     {
4504       gcc_assert (!slp_reduc);
4505       if (nested_in_vect_loop)
4506 	{
4507           new_phi = new_phis[0];
4508 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4509 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4510 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
4511 	}
4512       else
4513 	{
4514           new_temp = scalar_results[0];
4515 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4516 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
4517 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4518 	}
4519 
4520       epilog_stmt = gimple_build_assign (new_dest, expr);
4521       new_temp = make_ssa_name (new_dest, epilog_stmt);
4522       gimple_assign_set_lhs (epilog_stmt, new_temp);
4523       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4524       if (nested_in_vect_loop)
4525         {
4526           set_vinfo_for_stmt (epilog_stmt,
4527                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4528                                                  NULL));
4529           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4530                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4531 
4532           if (!double_reduc)
4533             scalar_results.quick_push (new_temp);
4534           else
4535             scalar_results[0] = new_temp;
4536         }
4537       else
4538         scalar_results[0] = new_temp;
4539 
4540       new_phis[0] = epilog_stmt;
4541     }
4542 
4543   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4544           phis with new adjusted scalar results, i.e., replace use <s_out0>
4545           with use <s_out4>.
4546 
4547      Transform:
4548         loop_exit:
4549           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4550           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4551           v_out2 = reduce <v_out1>
4552           s_out3 = extract_field <v_out2, 0>
4553           s_out4 = adjust_result <s_out3>
4554           use <s_out0>
4555           use <s_out0>
4556 
4557      into:
4558 
4559         loop_exit:
4560           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4561           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4562           v_out2 = reduce <v_out1>
4563           s_out3 = extract_field <v_out2, 0>
4564           s_out4 = adjust_result <s_out3>
4565           use <s_out4>
4566           use <s_out4> */
4567 
4568 
4569   /* In SLP reduction chain we reduce vector results into one vector if
4570      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4571      the last stmt in the reduction chain, since we are looking for the loop
4572      exit phi node.  */
4573   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4574     {
4575       scalar_dest = gimple_assign_lhs (
4576 			SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4577       group_size = 1;
4578     }
4579 
4580   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4581      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
4582      need to match SCALAR_RESULTS with corresponding statements.  The first
4583      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4584      the first vector stmt, etc.
4585      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4586   if (group_size > new_phis.length ())
4587     {
4588       ratio = group_size / new_phis.length ();
4589       gcc_assert (!(group_size % new_phis.length ()));
4590     }
4591   else
4592     ratio = 1;
4593 
4594   for (k = 0; k < group_size; k++)
4595     {
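      /* Every RATIO scalar results correspond to one vector result, so advance
         to the next vector stmt, its reduction phi (and, for double reductions,
         its inner-loop phi) every RATIO iterations.  */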
4596       if (k % ratio == 0)
4597         {
4598           epilog_stmt = new_phis[k / ratio];
4599           reduction_phi = reduction_phis[k / ratio];
4600 	  if (double_reduc)
4601 	    inner_phi = inner_phis[k / ratio];
4602         }
4603 
4604       if (slp_reduc)
4605         {
4606           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4607 
4608           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4609           /* SLP statements can't participate in patterns.  */
4610           gcc_assert (!orig_stmt);
4611           scalar_dest = gimple_assign_lhs (current_stmt);
4612         }
4613 
4614       phis.create (3);
4615       /* Find the loop-closed-use at the loop exit of the original scalar
4616          result.  (The reduction result is expected to have two immediate uses -
4617          one at the latch block, and one at the loop exit).  */
4618       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4619         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4620 	    && !is_gimple_debug (USE_STMT (use_p)))
4621           phis.safe_push (USE_STMT (use_p));
4622 
4623       /* While we expect to have found an exit_phi because of loop-closed-ssa
4624          form we can end up without one if the scalar cycle is dead.  */
4625 
4626       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4627         {
4628           if (outer_loop)
4629             {
4630               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4631               gphi *vect_phi;
4632 
4633               /* FORNOW. Currently not supporting the case where an inner-loop
4634                  reduction is not used in the outer-loop (but only outside the
4635                  outer-loop), unless it is a double reduction.  */
4636               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4637                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4638                           || double_reduc);
4639 
4640 	      if (double_reduc)
4641 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4642 	      else
4643 		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4644               if (!double_reduc
4645                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4646                       != vect_double_reduction_def)
4647                 continue;
4648 
4649               /* Handle double reduction:
4650 
4651                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4652                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4653                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4654                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4655 
4656                  At that point the regular reduction (stmt2 and stmt3) is
4657                  already vectorized, as well as the exit phi node, stmt4.
4658                  Here we vectorize the phi node of double reduction, stmt1, and
4659                  update all relevant statements.  */
4660 
4661               /* Go through all the uses of s2 to find double reduction phi
4662                  node, i.e., stmt1 above.  */
4663               orig_name = PHI_RESULT (exit_phi);
4664               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4665                 {
4666                   stmt_vec_info use_stmt_vinfo;
4667                   stmt_vec_info new_phi_vinfo;
4668                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4669                   basic_block bb = gimple_bb (use_stmt);
4670                   gimple use;
4671 
4672                   /* Check that USE_STMT is really a double reduction phi
4673                      node.  */
4674                   if (gimple_code (use_stmt) != GIMPLE_PHI
4675                       || gimple_phi_num_args (use_stmt) != 2
4676                       || bb->loop_father != outer_loop)
4677                     continue;
4678                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4679                   if (!use_stmt_vinfo
4680                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4681                           != vect_double_reduction_def)
4682 		    continue;
4683 
4684                   /* Create vector phi node for double reduction:
4685                      vs1 = phi <vs0, vs2>
4686                      vs1 was created previously in this function by a call to
4687                        vect_get_vec_def_for_operand and is stored in
4688                        vec_initial_def;
4689                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4690                      vs0 is created here.  */
4691 
4692                   /* Create vector phi node.  */
4693                   vect_phi = create_phi_node (vec_initial_def, bb);
4694                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4695                                     loop_vec_info_for_loop (outer_loop), NULL);
4696                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4697 
4698                   /* Create vs0 - initial def of the double reduction phi.  */
4699                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4700                                              loop_preheader_edge (outer_loop));
4701                   init_def = get_initial_def_for_reduction (stmt,
4702                                                           preheader_arg, NULL);
4703                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4704                                                     vectype, NULL);
4705 
4706                   /* Update phi node arguments with vs0 and vs2.  */
4707                   add_phi_arg (vect_phi, vect_phi_init,
4708                                loop_preheader_edge (outer_loop),
4709                                UNKNOWN_LOCATION);
4710                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4711                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4712                   if (dump_enabled_p ())
4713                     {
4714                       dump_printf_loc (MSG_NOTE, vect_location,
4715 				       "created double reduction phi node: ");
4716                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4717                       dump_printf (MSG_NOTE, "\n");
4718                     }
4719 
4720                   vect_phi_res = PHI_RESULT (vect_phi);
4721 
4722                   /* Replace the use, i.e., set the correct vs1 in the regular
4723                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4724                      loop is redundant.  */
4725                   use = reduction_phi;
4726                   for (j = 0; j < ncopies; j++)
4727                     {
4728                       edge pr_edge = loop_preheader_edge (loop);
4729                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4730                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4731                     }
4732                 }
4733             }
4734         }
4735 
4736       phis.release ();
4737       if (nested_in_vect_loop)
4738         {
4739           if (double_reduc)
4740             loop = outer_loop;
4741           else
4742             continue;
4743         }
4744 
4745       phis.create (3);
4746       /* Find the loop-closed-use at the loop exit of the original scalar
4747          result.  (The reduction result is expected to have two immediate uses,
4748          one at the latch block, and one at the loop exit).  For double
4749          reductions we are looking for exit phis of the outer loop.  */
4750       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4751         {
4752           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4753 	    {
4754 	      if (!is_gimple_debug (USE_STMT (use_p)))
4755 		phis.safe_push (USE_STMT (use_p));
4756 	    }
4757           else
4758             {
4759               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4760                 {
4761                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4762 
4763                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4764                     {
4765                       if (!flow_bb_inside_loop_p (loop,
4766                                              gimple_bb (USE_STMT (phi_use_p)))
4767 			  && !is_gimple_debug (USE_STMT (phi_use_p)))
4768                         phis.safe_push (USE_STMT (phi_use_p));
4769                     }
4770                 }
4771             }
4772         }
4773 
4774       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4775         {
4776           /* Replace the uses:  */
4777           orig_name = PHI_RESULT (exit_phi);
4778           scalar_result = scalar_results[k];
4779           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4780             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4781               SET_USE (use_p, scalar_result);
4782         }
4783 
4784       phis.release ();
4785     }
4786 }
4787 
4788 
4789 /* Function vectorizable_reduction.
4790 
4791    Check if STMT performs a reduction operation that can be vectorized.
4792    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4793    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4794    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4795 
4796    This function also handles reduction idioms (patterns) that have been
4797    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4798    of this form:
4799      X = pattern_expr (arg0, arg1, ..., X)
4800    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4801    sequence that had been detected and replaced by the pattern-stmt (STMT).
4802 
4803    In some cases of reduction patterns, the type of the reduction variable X is
4804    different than the type of the other arguments of STMT.
4805    In such cases, the vectype that is used when transforming STMT into a vector
4806    stmt is different than the vectype that is used to determine the
4807    vectorization factor, because it consists of a different number of elements
4808    than the actual number of elements that are being operated upon in parallel.
4809 
4810    For example, consider an accumulation of shorts into an int accumulator.
4811    On some targets it's possible to vectorize this pattern operating on 8
4812    shorts at a time (hence, the vectype for purposes of determining the
4813    vectorization factor should be V8HI); on the other hand, the vectype that
4814    is used to create the vector form is actually V4SI (the type of the result).
4815 
4816    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4817    indicates the actual level of parallelism (V8HI in the example), so that
4818    the right vectorization factor can be derived.  This vectype
4819    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4820    be used to create the vectorized stmt.  The right vectype for the vectorized
4821    stmt is obtained from the type of the result X:
4822         get_vectype_for_scalar_type (TREE_TYPE (X))
4823 
4824    This means that, contrary to "regular" reductions (or "regular" stmts in
4825    general), the following equation:
4826       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4827    does *NOT* necessarily hold for reduction patterns.  */
4828 
4829 bool
4830 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4831 			gimple *vec_stmt, slp_tree slp_node)
4832 {
4833   tree vec_dest;
4834   tree scalar_dest;
4835   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4836   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4837   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4838   tree vectype_in = NULL_TREE;
4839   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4840   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4841   enum tree_code code, orig_code, epilog_reduc_code;
4842   machine_mode vec_mode;
4843   int op_type;
4844   optab optab, reduc_optab;
4845   tree new_temp = NULL_TREE;
4846   tree def;
4847   gimple def_stmt;
4848   enum vect_def_type dt;
4849   gphi *new_phi = NULL;
4850   tree scalar_type;
4851   bool is_simple_use;
4852   gimple orig_stmt;
4853   stmt_vec_info orig_stmt_info;
4854   tree expr = NULL_TREE;
4855   int i;
4856   int ncopies;
4857   int epilog_copies;
4858   stmt_vec_info prev_stmt_info, prev_phi_info;
4859   bool single_defuse_cycle = false;
4860   tree reduc_def = NULL_TREE;
4861   gimple new_stmt = NULL;
4862   int j;
4863   tree ops[3];
4864   bool nested_cycle = false, found_nested_cycle_def = false;
4865   gimple reduc_def_stmt = NULL;
4866   /* By default the reduction variable is the last operand in the statement.  */
4867   int reduc_index = 2;
4868   bool double_reduc = false, dummy;
4869   basic_block def_bb;
4870   struct loop * def_stmt_loop, *outer_loop = NULL;
4871   tree def_arg;
4872   gimple def_arg_stmt;
4873   auto_vec<tree> vec_oprnds0;
4874   auto_vec<tree> vec_oprnds1;
4875   auto_vec<tree> vect_defs;
4876   auto_vec<gimple> phis;
4877   int vec_num;
4878   tree def0, def1, tem, op0, op1 = NULL_TREE;
4879 
4880   /* In case of a reduction chain we switch to the first stmt in the chain, but
4881      we don't update STMT_INFO, since only the last stmt is marked as reduction
4882      and has reduction properties.  */
4883   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4884     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4885 
4886   if (nested_in_vect_loop_p (loop, stmt))
4887     {
4888       outer_loop = loop;
4889       loop = loop->inner;
4890       nested_cycle = true;
4891     }
4892 
4893   /* 1. Is vectorizable reduction?  */
4894   /* Not supportable if the reduction variable is used in the loop, unless
4895      it's a reduction chain.  */
4896   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4897       && !GROUP_FIRST_ELEMENT (stmt_info))
4898     return false;
4899 
4900   /* Reductions that are not used even in an enclosing outer-loop
4901      are expected to be "live" (used out of the loop).  */
4902   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4903       && !STMT_VINFO_LIVE_P (stmt_info))
4904     return false;
4905 
4906   /* Make sure it was already recognized as a reduction computation.  */
4907   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4908       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4909     return false;
4910 
4911   /* 2. Has this been recognized as a reduction pattern?
4912 
4913      Check if STMT represents a pattern that has been recognized
4914      in earlier analysis stages.  For stmts that represent a pattern,
4915      the STMT_VINFO_RELATED_STMT field records the last stmt in
4916      the original sequence that constitutes the pattern.  */
4917 
4918   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4919   if (orig_stmt)
4920     {
4921       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4922       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4923       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4924     }
4925 
4926   /* 3. Check the operands of the operation.  The first operands are defined
4927         inside the loop body. The last operand is the reduction variable,
4928         which is defined by the loop-header-phi.  */
4929 
4930   gcc_assert (is_gimple_assign (stmt));
4931 
4932   /* Flatten RHS.  */
4933   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4934     {
4935     case GIMPLE_SINGLE_RHS:
4936       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4937       if (op_type == ternary_op)
4938 	{
4939 	  tree rhs = gimple_assign_rhs1 (stmt);
4940 	  ops[0] = TREE_OPERAND (rhs, 0);
4941 	  ops[1] = TREE_OPERAND (rhs, 1);
4942 	  ops[2] = TREE_OPERAND (rhs, 2);
4943 	  code = TREE_CODE (rhs);
4944 	}
4945       else
4946 	return false;
4947       break;
4948 
4949     case GIMPLE_BINARY_RHS:
4950       code = gimple_assign_rhs_code (stmt);
4951       op_type = TREE_CODE_LENGTH (code);
4952       gcc_assert (op_type == binary_op);
4953       ops[0] = gimple_assign_rhs1 (stmt);
4954       ops[1] = gimple_assign_rhs2 (stmt);
4955       break;
4956 
4957     case GIMPLE_TERNARY_RHS:
4958       code = gimple_assign_rhs_code (stmt);
4959       op_type = TREE_CODE_LENGTH (code);
4960       gcc_assert (op_type == ternary_op);
4961       ops[0] = gimple_assign_rhs1 (stmt);
4962       ops[1] = gimple_assign_rhs2 (stmt);
4963       ops[2] = gimple_assign_rhs3 (stmt);
4964       break;
4965 
4966     case GIMPLE_UNARY_RHS:
4967       return false;
4968 
4969     default:
4970       gcc_unreachable ();
4971     }
4972 
4973   if (code == COND_EXPR && slp_node)
4974     return false;
4975 
4976   scalar_dest = gimple_assign_lhs (stmt);
4977   scalar_type = TREE_TYPE (scalar_dest);
4978   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4979       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4980     return false;
4981 
4982   /* Do not try to vectorize bit-precision reductions.  */
4983   if ((TYPE_PRECISION (scalar_type)
4984        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4985     return false;
4986 
4987   /* All uses but the last are expected to be defined in the loop.
4988      The last use is the reduction variable.  In case of nested cycle this
4989      assumption is not true: we use reduc_index to record the index of the
4990      reduction variable.  */
4991   for (i = 0; i < op_type - 1; i++)
4992     {
4993       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4994       if (i == 0 && code == COND_EXPR)
4995         continue;
4996 
4997       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4998 					    &def_stmt, &def, &dt, &tem);
4999       if (!vectype_in)
5000 	vectype_in = tem;
5001       gcc_assert (is_simple_use);
5002 
5003       if (dt != vect_internal_def
5004 	  && dt != vect_external_def
5005 	  && dt != vect_constant_def
5006 	  && dt != vect_induction_def
5007           && !(dt == vect_nested_cycle && nested_cycle))
5008 	return false;
5009 
5010       if (dt == vect_nested_cycle)
5011         {
5012           found_nested_cycle_def = true;
5013           reduc_def_stmt = def_stmt;
5014           reduc_index = i;
5015         }
5016     }
5017 
5018   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5019 					&def_stmt, &def, &dt, &tem);
5020   if (!vectype_in)
5021     vectype_in = tem;
5022   gcc_assert (is_simple_use);
5023   if (!found_nested_cycle_def)
5024     reduc_def_stmt = def_stmt;
5025 
5026   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5027     return false;
5028 
5029   if (!(dt == vect_reduction_def
5030 	|| dt == vect_nested_cycle
5031 	|| ((dt == vect_internal_def || dt == vect_external_def
5032 	     || dt == vect_constant_def || dt == vect_induction_def)
5033 	    && nested_cycle && found_nested_cycle_def)))
5034     {
5035       /* For pattern recognized stmts, orig_stmt might be a reduction,
5036 	 but some helper statements for the pattern might not, or
5037 	 might be COND_EXPRs with reduction uses in the condition.  */
5038       gcc_assert (orig_stmt);
5039       return false;
5040     }
5041 
5042   if (orig_stmt)
5043     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5044                                                        reduc_def_stmt,
5045                                                        !nested_cycle,
5046                                                        &dummy));
5047   else
5048     {
5049       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5050                                              !nested_cycle, &dummy);
5051       /* We changed STMT to be the first stmt in the reduction chain, hence we
5052          check that in this case the first element in the chain is STMT.  */
5053       gcc_assert (stmt == tmp
5054                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5055     }
5056 
5057   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5058     return false;
5059 
5060   if (slp_node || PURE_SLP_STMT (stmt_info))
5061     ncopies = 1;
5062   else
5063     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5064                / TYPE_VECTOR_SUBPARTS (vectype_in));
5065 
5066   gcc_assert (ncopies >= 1);
5067 
5068   vec_mode = TYPE_MODE (vectype_in);
5069 
5070   if (code == COND_EXPR)
5071     {
5072       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5073         {
5074           if (dump_enabled_p ())
5075 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5076 			     "unsupported condition in reduction\n");
5077 
5078           return false;
5079         }
5080     }
5081   else
5082     {
5083       /* 4. Supportable by target?  */
5084 
5085       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5086 	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
5087 	{
5088 	  /* Shifts and rotates are only supported by vectorizable_shifts,
5089 	     not vectorizable_reduction.  */
5090           if (dump_enabled_p ())
5091 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092 			     "unsupported shift or rotation.\n");
5093 	  return false;
5094 	}
5095 
5096       /* 4.1. check support for the operation in the loop  */
5097       optab = optab_for_tree_code (code, vectype_in, optab_default);
5098       if (!optab)
5099         {
5100           if (dump_enabled_p ())
5101 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 			     "no optab.\n");
5103 
5104           return false;
5105         }
5106 
5107       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5108         {
5109           if (dump_enabled_p ())
5110             dump_printf (MSG_NOTE, "op not supported by target.\n");
5111 
5112           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5113               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5114 	          < vect_min_worthwhile_factor (code))
5115             return false;
5116 
5117           if (dump_enabled_p ())
5118   	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5119         }
5120 
5121       /* Worthwhile without SIMD support?  */
5122       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5123           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5124    	     < vect_min_worthwhile_factor (code))
5125         {
5126           if (dump_enabled_p ())
5127 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5128 			     "not worthwhile without SIMD support.\n");
5129 
5130           return false;
5131         }
5132     }
5133 
5134   /* 4.2. Check support for the epilog operation.
5135 
5136           If STMT represents a reduction pattern, then the type of the
5137           reduction variable may be different than the type of the rest
5138           of the arguments.  For example, consider the case of accumulation
5139           of shorts into an int accumulator.  The original code:
5140                         S1: int_a = (int) short_a;
5141           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5142 
5143           was replaced with:
5144                         STMT: int_acc = widen_sum <short_a, int_acc>
5145 
5146           This means that:
5147           1. The tree-code that is used to create the vector operation in the
5148              epilog code (that reduces the partial results) is not the
5149              tree-code of STMT, but is rather the tree-code of the original
5150              stmt from the pattern that STMT is replacing.  I.e, in the example
5151              above we want to use 'widen_sum' in the loop, but 'plus' in the
5152              epilog.
5153           2. The type (mode) we use to check available target support
5154              for the vector operation to be created in the *epilog*, is
5155              determined by the type of the reduction variable (in the example
5156              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5157              However the type (mode) we use to check available target support
5158              for the vector operation to be created *inside the loop*, is
5159              determined by the type of the other arguments to STMT (in the
5160              example we'd check this: optab_handler (widen_sum_optab,
5161 	     vect_short_mode)).
5162 
5163           This is contrary to "regular" reductions, in which the types of all
5164           the arguments are the same as the type of the reduction variable.
5165           For "regular" reductions we can therefore use the same vector type
5166           (and also the same tree-code) when generating the epilog code and
5167           when generating the code inside the loop.  */
5168 
5169   if (orig_stmt)
5170     {
5171       /* This is a reduction pattern: get the vectype from the type of the
5172          reduction variable, and get the tree-code from orig_stmt.  */
5173       orig_code = gimple_assign_rhs_code (orig_stmt);
5174       gcc_assert (vectype_out);
5175       vec_mode = TYPE_MODE (vectype_out);
5176     }
5177   else
5178     {
5179       /* Regular reduction: the same vectype and tree-code that are used for
5180          the vector code inside the loop can be used for the epilog code.  */
5181       orig_code = code;
5182     }
5183 
5184   if (nested_cycle)
5185     {
5186       def_bb = gimple_bb (reduc_def_stmt);
5187       def_stmt_loop = def_bb->loop_father;
5188       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5189                                        loop_preheader_edge (def_stmt_loop));
5190       if (TREE_CODE (def_arg) == SSA_NAME
5191           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5192           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5193           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5194           && vinfo_for_stmt (def_arg_stmt)
5195           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5196               == vect_double_reduction_def)
5197         double_reduc = true;
5198     }
5199 
5200   epilog_reduc_code = ERROR_MARK;
5201   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5202     {
5203       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5204                                          optab_default);
5205       if (!reduc_optab)
5206         {
5207           if (dump_enabled_p ())
5208 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5209 			     "no optab for reduction.\n");
5210 
5211           epilog_reduc_code = ERROR_MARK;
5212         }
5213       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5214         {
5215           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5216           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5217             {
5218               if (dump_enabled_p ())
5219 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5220 				 "reduc op not supported by target.\n");
5221 
5222 	      epilog_reduc_code = ERROR_MARK;
5223 	    }
5224         }
5225     }
5226   else
5227     {
5228       if (!nested_cycle || double_reduc)
5229         {
5230           if (dump_enabled_p ())
5231 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5232 			     "no reduc code for scalar code.\n");
5233 
5234           return false;
5235         }
5236     }
5237 
5238   if (double_reduc && ncopies > 1)
5239     {
5240       if (dump_enabled_p ())
5241 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5242 			 "multiple types in double reduction\n");
5243 
5244       return false;
5245     }
5246 
5247   /* In case of widening multiplication by a constant, we update the type
5248      of the constant to be the type of the other operand.  We check that the
5249      constant fits the type in the pattern recognition pass.  */
5250   if (code == DOT_PROD_EXPR
5251       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5252     {
5253       if (TREE_CODE (ops[0]) == INTEGER_CST)
5254         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5255       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5256         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5257       else
5258         {
5259           if (dump_enabled_p ())
5260 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5261 			     "invalid types in dot-prod\n");
5262 
5263           return false;
5264         }
5265     }
5266 
5267   if (!vec_stmt) /* transformation not required.  */
5268     {
5269       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5270         return false;
5271       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5272       return true;
5273     }
5274 
5275   /** Transform.  **/
5276 
5277   if (dump_enabled_p ())
5278     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5279 
5280   /* FORNOW: Multiple types are not supported for condition.  */
5281   if (code == COND_EXPR)
5282     gcc_assert (ncopies == 1);
5283 
5284   /* Create the destination vector  */
5285   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5286 
5287   /* In case the vectorization factor (VF) is bigger than the number
5288      of elements that we can fit in a vectype (nunits), we have to generate
5289      more than one vector stmt - i.e. - we need to "unroll" the
5290      vector stmt by a factor VF/nunits.  For more details see documentation
5291      in vectorizable_operation.  */
5292 
5293   /* If the reduction is used in an outer loop we need to generate
5294      VF intermediate results, like so (e.g. for ncopies=2):
5295 	r0 = phi (init, r0)
5296 	r1 = phi (init, r1)
5297 	r0 = x0 + r0;
5298         r1 = x1 + r1;
5299     (i.e. we generate VF results in 2 registers).
5300     In this case we have a separate def-use cycle for each copy, and therefore
5301     for each copy we get the vector def for the reduction variable from the
5302     respective phi node created for this copy.
5303 
5304     Otherwise (the reduction is unused in the loop nest), we can combine
5305     together intermediate results, like so (e.g. for ncopies=2):
5306 	r = phi (init, r)
5307 	r = x0 + r;
5308 	r = x1 + r;
5309    (i.e. we generate VF/2 results in a single register).
5310    In this case for each copy we get the vector def for the reduction variable
5311    from the vectorized reduction operation generated in the previous iteration.
5312   */
5313 
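  /* STMT being unused in the loop nest is what permits the single def-use
     cycle described above; in that case only one copy of the epilog code
     is needed.  */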
5314   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5315     {
5316       single_defuse_cycle = true;
5317       epilog_copies = 1;
5318     }
5319   else
5320     epilog_copies = ncopies;
5321 
5322   prev_stmt_info = NULL;
5323   prev_phi_info = NULL;
5324   if (slp_node)
5325     {
5326       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5327       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5328                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5329     }
5330   else
5331     {
5332       vec_num = 1;
5333       vec_oprnds0.create (1);
5334       if (op_type == ternary_op)
5335         vec_oprnds1.create (1);
5336     }
5337 
5338   phis.create (vec_num);
5339   vect_defs.create (vec_num);
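  /* In the non-SLP case reserve a single slot in VECT_DEFS; it is overwritten
     below with the vector def produced for each copy.  */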
5340   if (!slp_node)
5341     vect_defs.quick_push (NULL_TREE);
5342 
5343   for (j = 0; j < ncopies; j++)
5344     {
5345       if (j == 0 || !single_defuse_cycle)
5346 	{
5347           for (i = 0; i < vec_num; i++)
5348             {
5349               /* Create the reduction-phi that defines the reduction
5350                  operand.  */
5351               new_phi = create_phi_node (vec_dest, loop->header);
5352               set_vinfo_for_stmt (new_phi,
5353                                   new_stmt_vec_info (new_phi, loop_vinfo,
5354                                                      NULL));
5355                if (j == 0 || slp_node)
5356                  phis.quick_push (new_phi);
5357             }
5358         }
5359 
5360       if (code == COND_EXPR)
5361         {
5362           gcc_assert (!slp_node);
5363           vectorizable_condition (stmt, gsi, vec_stmt,
5364                                   PHI_RESULT (phis[0]),
5365                                   reduc_index, NULL);
5366           /* Multiple types are not supported for condition.  */
5367           break;
5368         }
5369 
5370       /* Handle uses.  */
5371       if (j == 0)
5372         {
5373           op0 = ops[!reduc_index];
5374           if (op_type == ternary_op)
5375             {
5376               if (reduc_index == 0)
5377                 op1 = ops[2];
5378               else
5379                 op1 = ops[1];
5380             }
5381 
5382           if (slp_node)
5383             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5384                                slp_node, -1);
5385           else
5386             {
5387               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5388                                                             stmt, NULL);
5389               vec_oprnds0.quick_push (loop_vec_def0);
5390               if (op_type == ternary_op)
5391                {
5392                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5393                                                                NULL);
5394                  vec_oprnds1.quick_push (loop_vec_def1);
5395                }
5396             }
5397         }
5398       else
5399         {
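          /* For copies after the first (non-SLP), obtain the vector defs from
             the defs generated for the previous copy.  */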
5400           if (!slp_node)
5401             {
5402               enum vect_def_type dt;
5403               gimple dummy_stmt;
5404               tree dummy;
5405 
5406               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5407                                   &dummy_stmt, &dummy, &dt);
5408               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5409                                                               loop_vec_def0);
5410               vec_oprnds0[0] = loop_vec_def0;
5411               if (op_type == ternary_op)
5412                 {
5413                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5414                                       &dummy, &dt);
5415                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5416                                                                 loop_vec_def1);
5417                   vec_oprnds1[0] = loop_vec_def1;
5418                 }
5419             }
5420 
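          /* In a single def-use cycle the reduction operand for this copy is
             the result of the vectorized reduction stmt generated for the
             previous copy.  */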
5421           if (single_defuse_cycle)
5422             reduc_def = gimple_assign_lhs (new_stmt);
5423 
5424           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5425         }
5426 
5427       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5428         {
5429           if (slp_node)
5430             reduc_def = PHI_RESULT (phis[i]);
5431           else
5432             {
5433               if (!single_defuse_cycle || j == 0)
5434                 reduc_def = PHI_RESULT (new_phi);
5435             }
5436 
5437           def1 = ((op_type == ternary_op)
5438                   ? vec_oprnds1[i] : NULL);
5439           if (op_type == binary_op)
5440             {
5441               if (reduc_index == 0)
5442                 expr = build2 (code, vectype_out, reduc_def, def0);
5443               else
5444                 expr = build2 (code, vectype_out, def0, reduc_def);
5445             }
5446           else
5447             {
5448               if (reduc_index == 0)
5449                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5450               else
5451                 {
5452                   if (reduc_index == 1)
5453                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5454                   else
5455                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5456                 }
5457             }
5458 
5459           new_stmt = gimple_build_assign (vec_dest, expr);
5460           new_temp = make_ssa_name (vec_dest, new_stmt);
5461           gimple_assign_set_lhs (new_stmt, new_temp);
5462           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5463 
5464           if (slp_node)
5465             {
5466               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5467               vect_defs.quick_push (new_temp);
5468             }
5469           else
5470             vect_defs[0] = new_temp;
5471         }
5472 
5473       if (slp_node)
5474         continue;
5475 
5476       if (j == 0)
5477 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5478       else
5479 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5480 
5481       prev_stmt_info = vinfo_for_stmt (new_stmt);
5482       prev_phi_info = vinfo_for_stmt (new_phi);
5483     }
5484 
5485   /* Finalize the reduction-phi (set its arguments) and create the
5486      epilog reduction code.  */
5487   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5488     {
5489       new_temp = gimple_assign_lhs (*vec_stmt);
5490       vect_defs[0] = new_temp;
5491     }
5492 
5493   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5494                                     epilog_reduc_code, phis, reduc_index,
5495                                     double_reduc, slp_node);
5496 
5497   return true;
5498 }
5499 
5500 /* Function vect_min_worthwhile_factor.
5501 
5502    For a loop where we could vectorize the operation indicated by CODE,
5503    return the minimum vectorization factor that makes it worthwhile
5504    to use generic vectors.  */
5505 int
5506 vect_min_worthwhile_factor (enum tree_code code)
5507 {
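  /* These values are heuristic thresholds; they only matter when the
     operation would be emulated using generic word-mode vectors (see the
     "Worthwhile without SIMD support?" check in vectorizable_reduction).  */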
5508   switch (code)
5509     {
5510     case PLUS_EXPR:
5511     case MINUS_EXPR:
5512     case NEGATE_EXPR:
5513       return 4;
5514 
5515     case BIT_AND_EXPR:
5516     case BIT_IOR_EXPR:
5517     case BIT_XOR_EXPR:
5518     case BIT_NOT_EXPR:
5519       return 2;
5520 
5521     default:
5522       return INT_MAX;
5523     }
5524 }
5525 
5526 
5527 /* Function vectorizable_induction
5528 
5529    Check if PHI performs an induction computation that can be vectorized.
5530    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5531    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5532    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5533 
5534 bool
5535 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5536 			gimple *vec_stmt)
5537 {
5538   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5539   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5540   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5541   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5542   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5543   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5544   tree vec_def;
5545 
5546   gcc_assert (ncopies >= 1);
5547   /* FORNOW. These restrictions should be relaxed.  */
5548   if (nested_in_vect_loop_p (loop, phi))
5549     {
5550       imm_use_iterator imm_iter;
5551       use_operand_p use_p;
5552       gimple exit_phi;
5553       edge latch_e;
5554       tree loop_arg;
5555 
5556       if (ncopies > 1)
5557 	{
5558 	  if (dump_enabled_p ())
5559 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5560 			     "multiple types in nested loop.\n");
5561 	  return false;
5562 	}
5563 
5564       exit_phi = NULL;
5565       latch_e = loop_latch_edge (loop->inner);
5566       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5567       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5568 	{
5569 	  gimple use_stmt = USE_STMT (use_p);
5570 	  if (is_gimple_debug (use_stmt))
5571 	    continue;
5572 
5573 	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5574 	    {
5575 	      exit_phi = use_stmt;
5576 	      break;
5577 	    }
5578 	}
5579       if (exit_phi)
5580 	{
5581 	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5582 	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5583 		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5584 	    {
5585 	      if (dump_enabled_p ())
5586 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5587 				 "inner-loop induction only used outside "
5588 				 "of the outer vectorized loop.\n");
5589 	      return false;
5590 	    }
5591 	}
5592     }
5593 
5594   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5595     return false;
5596 
5597   /* FORNOW: SLP not supported.  */
5598   if (STMT_SLP_TYPE (stmt_info))
5599     return false;
5600 
5601   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5602 
5603   if (gimple_code (phi) != GIMPLE_PHI)
5604     return false;
5605 
5606   if (!vec_stmt) /* transformation not required.  */
5607     {
5608       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5609       if (dump_enabled_p ())
5610         dump_printf_loc (MSG_NOTE, vect_location,
5611                          "=== vectorizable_induction ===\n");
5612       vect_model_induction_cost (stmt_info, ncopies);
5613       return true;
5614     }
5615 
5616   /** Transform.  **/
5617 
5618   if (dump_enabled_p ())
5619     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5620 
5621   vec_def = get_initial_def_for_induction (phi);
5622   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5623   return true;
5624 }
5625 
5626 /* Function vectorizable_live_operation.
5627 
5628    STMT computes a value that is used outside the loop.  Check if
5629    it can be supported.  */
5630 
5631 bool
5632 vectorizable_live_operation (gimple stmt,
5633 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5634 			     gimple *vec_stmt)
5635 {
5636   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5637   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5639   int i;
5640   int op_type;
5641   tree op;
5642   tree def;
5643   gimple def_stmt;
5644   enum vect_def_type dt;
5645   enum tree_code code;
5646   enum gimple_rhs_class rhs_class;
5647 
5648   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5649 
5650   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5651     return false;
5652 
5653   if (!is_gimple_assign (stmt))
5654     {
5655       if (gimple_call_internal_p (stmt)
5656 	  && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5657 	  && gimple_call_lhs (stmt)
5658 	  && loop->simduid
5659 	  && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5660 	  && loop->simduid
5661 	     == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5662 	{
5663 	  edge e = single_exit (loop);
5664 	  basic_block merge_bb = e->dest;
5665 	  imm_use_iterator imm_iter;
5666 	  use_operand_p use_p;
5667 	  tree lhs = gimple_call_lhs (stmt);
5668 
5669 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5670 	    {
5671 	      gimple use_stmt = USE_STMT (use_p);
5672 	      if (gimple_code (use_stmt) == GIMPLE_PHI
5673 		  && gimple_bb (use_stmt) == merge_bb)
5674 		{
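		  /* After vectorization, the simd lane value that is live
		     after the loop is that of the last lane, i.e. VF - 1,
		     so the exit phi argument can simply be that constant.  */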
5675 		  if (vec_stmt)
5676 		    {
5677 		      tree vfm1
5678 			= build_int_cst (unsigned_type_node,
5679 					 loop_vinfo->vectorization_factor - 1);
5680 		      SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5681 		    }
5682 		  return true;
5683 		}
5684 	    }
5685 	}
5686 
5687       return false;
5688     }
5689 
5690   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5691     return false;
5692 
5693   /* FORNOW. CHECKME. */
5694   if (nested_in_vect_loop_p (loop, stmt))
5695     return false;
5696 
5697   code = gimple_assign_rhs_code (stmt);
5698   op_type = TREE_CODE_LENGTH (code);
5699   rhs_class = get_gimple_rhs_class (code);
5700   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5701   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5702 
5703   /* FORNOW: support only if all uses are invariant.  This means
5704      that the scalar operations can remain in place, unvectorized.
5705      The original last scalar value that they compute will be used.  */
5706 
5707   for (i = 0; i < op_type; i++)
5708     {
5709       if (rhs_class == GIMPLE_SINGLE_RHS)
5710 	op = TREE_OPERAND (gimple_op (stmt, 1), i);
5711       else
5712 	op = gimple_op (stmt, i + 1);
5713       if (op
5714           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5715 				  &dt))
5716         {
5717           if (dump_enabled_p ())
5718 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 			     "use not simple.\n");
5720           return false;
5721         }
5722 
5723       if (dt != vect_external_def && dt != vect_constant_def)
5724         return false;
5725     }
5726 
5727   /* No transformation is required for the cases we currently support.  */
5728   return true;
5729 }
5730 
5731 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5732 
5733 static void
5734 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5735 {
5736   ssa_op_iter op_iter;
5737   imm_use_iterator imm_iter;
5738   def_operand_p def_p;
5739   gimple ustmt;
5740 
5741   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5742     {
5743       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5744 	{
5745 	  basic_block bb;
5746 
5747 	  if (!is_gimple_debug (ustmt))
5748 	    continue;
5749 
5750 	  bb = gimple_bb (ustmt);
5751 
5752 	  if (!flow_bb_inside_loop_p (loop, bb))
5753 	    {
5754 	      if (gimple_debug_bind_p (ustmt))
5755 		{
5756 		  if (dump_enabled_p ())
5757 		    dump_printf_loc (MSG_NOTE, vect_location,
5758                                      "killing debug use\n");
5759 
5760 		  gimple_debug_bind_reset_value (ustmt);
5761 		  update_stmt (ustmt);
5762 		}
5763 	      else
5764 		gcc_unreachable ();
5765 	    }
5766 	}
5767     }
5768 }
5769 
5770 
5771 /* This function builds ni_name = number of iterations.  Statements
5772    are emitted on the loop preheader edge.  */
5773 
5774 static tree
5775 vect_build_loop_niters (loop_vec_info loop_vinfo)
5776 {
5777   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5778   if (TREE_CODE (ni) == INTEGER_CST)
5779     return ni;
5780   else
5781     {
5782       tree ni_name, var;
5783       gimple_seq stmts = NULL;
5784       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5785 
5786       var = create_tmp_var (TREE_TYPE (ni), "niters");
5787       ni_name = force_gimple_operand (ni, &stmts, false, var);
5788       if (stmts)
5789 	gsi_insert_seq_on_edge_immediate (pe, stmts);
5790 
5791       return ni_name;
5792     }
5793 }
5794 
5795 
5796 /* This function generates the following statements:
5797 
5798    ni_name = number of iterations loop executes
5799    ratio = ni_name / vf
5800    ratio_mult_vf_name = ratio * vf
5801 
5802    and places them on the loop preheader edge.  */
5803 
5804 static void
5805 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5806 				 tree ni_name,
5807 				 tree *ratio_mult_vf_name_ptr,
5808 				 tree *ratio_name_ptr)
5809 {
5810   tree ni_minus_gap_name;
5811   tree var;
5812   tree ratio_name;
5813   tree ratio_mult_vf_name;
5814   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5815   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5816   tree log_vf;
5817 
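  /* The vectorization factor is a power of two, so the divisions and
     multiplications by VF below can be done as shifts by log2 (VF).  */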
5818   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5819 
5820   /* If epilogue loop is required because of data accesses with gaps, we
5821      subtract one iteration from the total number of iterations here for
5822      correct calculation of RATIO.  */
5823   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5824     {
5825       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5826 				       ni_name,
5827 			               build_one_cst (TREE_TYPE (ni_name)));
5828       if (!is_gimple_val (ni_minus_gap_name))
5829 	{
5830 	  var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5831           gimple stmts = NULL;
5832           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5833 						    true, var);
5834 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
5835         }
5836     }
5837   else
5838     ni_minus_gap_name = ni_name;
5839 
5840   /* Create: ratio = ni >> log2(vf) */
5841   /* ???  As we have ni == number of latch executions + 1, ni could
5842      have overflown to zero.  So avoid computing ratio based on ni
5843      but compute it using the fact that we know ratio will be at least
5844      one, thus via (ni - vf) >> log2(vf) + 1.  */
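  /* For example, with VF == 4 and NI == 10 this gives
     ((10 - 4) >> 2) + 1 == 2, and RATIO_MULT_VF == 2 << 2 == 8.  */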
5845   ratio_name
5846     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5847 		   fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5848 				fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5849 					     ni_minus_gap_name,
5850 					     build_int_cst
5851 					       (TREE_TYPE (ni_name), vf)),
5852 				log_vf),
5853 		   build_int_cst (TREE_TYPE (ni_name), 1));
5854   if (!is_gimple_val (ratio_name))
5855     {
5856       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5857       gimple stmts = NULL;
5858       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5859       gsi_insert_seq_on_edge_immediate (pe, stmts);
5860     }
5861   *ratio_name_ptr = ratio_name;
5862 
5863   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5864 
5865   if (ratio_mult_vf_name_ptr)
5866     {
5867       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5868 					ratio_name, log_vf);
5869       if (!is_gimple_val (ratio_mult_vf_name))
5870 	{
5871 	  var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5872 	  gimple stmts = NULL;
5873 	  ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5874 						     true, var);
5875 	  gsi_insert_seq_on_edge_immediate (pe, stmts);
5876 	}
5877       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5878     }
5879 
5880   return;
5881 }
5882 
5883 
5884 /* Function vect_transform_loop.
5885 
5886    The analysis phase has determined that the loop is vectorizable.
5887    Vectorize the loop - created vectorized stmts to replace the scalar
5888    stmts in the loop, and update the loop exit condition.  */
5889 
5890 void
5891 vect_transform_loop (loop_vec_info loop_vinfo)
5892 {
5893   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5894   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5895   int nbbs = loop->num_nodes;
5896   int i;
5897   tree ratio = NULL;
5898   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5899   bool grouped_store;
5900   bool slp_scheduled = false;
5901   gimple stmt, pattern_stmt;
5902   gimple_seq pattern_def_seq = NULL;
5903   gimple_stmt_iterator pattern_def_si = gsi_none ();
5904   bool transform_pattern_stmt = false;
5905   bool check_profitability = false;
5906   int th;
5907   /* Record number of iterations before we started tampering with the profile. */
5908   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5909 
5910   if (dump_enabled_p ())
5911     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5912 
5913   /* If the profile is imprecise, we have a chance to fix it up.  */
5914   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5915     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5916 
5917   /* Use the more conservative vectorization threshold.  If the number
5918      of iterations is constant, assume the cost check has been performed
5919      by our caller.  If the threshold makes all loops profitable that
5920      run at least the vectorization factor number of times, checking
5921      is pointless, too.  */
5922   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5923   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5924       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5925     {
5926       if (dump_enabled_p ())
5927 	dump_printf_loc (MSG_NOTE, vect_location,
5928 			 "Profitability threshold is %d loop iterations.\n",
5929                          th);
5930       check_profitability = true;
5931     }
5932 
5933   /* Version the loop first, if required, so the profitability check
5934      comes first.  */
5935 
5936   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5937       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5938     {
5939       vect_loop_versioning (loop_vinfo, th, check_profitability);
5940       check_profitability = false;
5941     }
5942 
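  /* Build a (possibly symbolic) expression for the number of iterations
     and remember it before any peeling below adjusts LOOP_VINFO_NITERS.  */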
5943   tree ni_name = vect_build_loop_niters (loop_vinfo);
5944   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5945 
5946   /* Peel the loop if there are data refs with unknown alignment.
5947      Only one store data ref with unknown alignment is allowed.  */
5948 
5949   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5950     {
5951       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5952 				     th, check_profitability);
5953       check_profitability = false;
5954       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5955 	 be re-computed.  */
5956       ni_name = NULL_TREE;
5957     }
5958 
5959   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5960      compile time constant), or it is a constant that is not a multiple of
5961      the vectorization factor, then an epilogue loop needs to be created.
5962      We therefore duplicate the loop: the original loop will be vectorized,
5963      and will compute the first (n/VF) iterations.  The second copy of the loop
5964      will remain scalar and will compute the remaining (n%VF) iterations.
5965      (VF is the vectorization factor).  */
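  /* Illustrative sketch only (not generated code), assuming VF == 4:

	for (i = 0; i < (n / 4) * 4; i += 4)
	  ... vectorized statements, 4 scalar iterations at a time ...
	for (; i < n; i++)
	  ... original scalar statements (epilogue) ...
  */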
5966 
5967   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5968       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5969     {
5970       tree ratio_mult_vf;
5971       if (!ni_name)
5972 	ni_name = vect_build_loop_niters (loop_vinfo);
5973       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5974 				       &ratio);
5975       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5976 				      th, check_profitability);
5977     }
5978   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5979     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5980 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
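    /* E.g. a known iteration count of 16 with VF == 4 gives RATIO == 4;
       no epilogue loop is needed because the count divides evenly.  */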
5981   else
5982     {
5983       if (!ni_name)
5984 	ni_name = vect_build_loop_niters (loop_vinfo);
5985       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5986     }
5987 
5988   /* 1) Make sure the loop header has exactly two entries
5989      2) Make sure we have a preheader basic block.  */
5990 
5991   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5992 
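  /* Requirement 2: split the preheader edge so the loop has a dedicated,
     initially empty preheader basic block into which new code can be
     inserted.  */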
5993   split_edge (loop_preheader_edge (loop));
5994 
5995   /* FORNOW: the vectorizer supports only loops whose body consists
5996      of one basic block (header + empty latch).  When the vectorizer
5997      supports more involved loop forms, the order in which the BBs are
5998      traversed will need to be reconsidered.  */
5999 
6000   for (i = 0; i < nbbs; i++)
6001     {
6002       basic_block bb = bbs[i];
6003       stmt_vec_info stmt_info;
6004 
6005       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6006 	   gsi_next (&si))
6007         {
6008 	  gphi *phi = si.phi ();
6009 	  if (dump_enabled_p ())
6010 	    {
6011 	      dump_printf_loc (MSG_NOTE, vect_location,
6012                                "------>vectorizing phi: ");
6013 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6014               dump_printf (MSG_NOTE, "\n");
6015 	    }
6016 	  stmt_info = vinfo_for_stmt (phi);
6017 	  if (!stmt_info)
6018 	    continue;
6019 
6020 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6021 	    vect_loop_kill_debug_uses (loop, phi);
6022 
6023 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6024 	      && !STMT_VINFO_LIVE_P (stmt_info))
6025 	    continue;
6026 
6027 	  if (STMT_VINFO_VECTYPE (stmt_info)
6028 	      && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6029 		  != (unsigned HOST_WIDE_INT) vectorization_factor)
6030 	      && dump_enabled_p ())
6031 	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6032 
6033 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6034 	    {
6035 	      if (dump_enabled_p ())
6036 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6037 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6038 	    }
6039 	}
6040 
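      /* Walk the statements of the block.  SI is advanced only once a
         statement and any related pattern (def) statements have all been
         transformed, so the same scalar statement may be revisited here.  */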
6041       pattern_stmt = NULL;
6042       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6043 	   !gsi_end_p (si) || transform_pattern_stmt;)
6044 	{
6045 	  bool is_store;
6046 
6047           if (transform_pattern_stmt)
6048 	    stmt = pattern_stmt;
6049           else
6050 	    {
6051 	      stmt = gsi_stmt (si);
6052 	      /* During vectorization remove existing clobber stmts.  */
6053 	      if (gimple_clobber_p (stmt))
6054 		{
6055 		  unlink_stmt_vdef (stmt);
6056 		  gsi_remove (&si, true);
6057 		  release_defs (stmt);
6058 		  continue;
6059 		}
6060 	    }
6061 
6062 	  if (dump_enabled_p ())
6063 	    {
6064 	      dump_printf_loc (MSG_NOTE, vect_location,
6065 			       "------>vectorizing statement: ");
6066 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6067               dump_printf (MSG_NOTE, "\n");
6068 	    }
6069 
6070 	  stmt_info = vinfo_for_stmt (stmt);
6071 
6072 	  /* vector stmts created in the outer-loop during vectorization of
6073 	     stmts in an inner-loop may not have a stmt_info, and do not
6074 	     need to be vectorized.  */
6075 	  if (!stmt_info)
6076 	    {
6077 	      gsi_next (&si);
6078 	      continue;
6079 	    }
6080 
6081 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6082 	    vect_loop_kill_debug_uses (loop, stmt);
6083 
6084 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6085 	      && !STMT_VINFO_LIVE_P (stmt_info))
6086             {
6087               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6088                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6089                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6090                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6091                 {
6092                   stmt = pattern_stmt;
6093                   stmt_info = vinfo_for_stmt (stmt);
6094                 }
6095               else
6096 	        {
6097    	          gsi_next (&si);
6098 	          continue;
6099                 }
6100 	    }
6101           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6102                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6103                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6104                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6105             transform_pattern_stmt = true;
6106 
6107 	  /* If the pattern statement has def stmts, vectorize them too.  */
6108 	  if (is_pattern_stmt_p (stmt_info))
6109 	    {
6110 	      if (pattern_def_seq == NULL)
6111 		{
6112 		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6113 		  pattern_def_si = gsi_start (pattern_def_seq);
6114 		}
6115 	      else if (!gsi_end_p (pattern_def_si))
6116 		gsi_next (&pattern_def_si);
6117 	      if (pattern_def_seq != NULL)
6118 		{
6119 		  gimple pattern_def_stmt = NULL;
6120 		  stmt_vec_info pattern_def_stmt_info = NULL;
6121 
6122 		  while (!gsi_end_p (pattern_def_si))
6123 		    {
6124 		      pattern_def_stmt = gsi_stmt (pattern_def_si);
6125 		      pattern_def_stmt_info
6126 			= vinfo_for_stmt (pattern_def_stmt);
6127 		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6128 			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6129 			break;
6130 		      gsi_next (&pattern_def_si);
6131 		    }
6132 
6133 		  if (!gsi_end_p (pattern_def_si))
6134 		    {
6135 		      if (dump_enabled_p ())
6136 			{
6137 			  dump_printf_loc (MSG_NOTE, vect_location,
6138 					   "==> vectorizing pattern def "
6139 					   "stmt: ");
6140 			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6141 					    pattern_def_stmt, 0);
6142                           dump_printf (MSG_NOTE, "\n");
6143 			}
6144 
6145 		      stmt = pattern_def_stmt;
6146 		      stmt_info = pattern_def_stmt_info;
6147 		    }
6148 		  else
6149 		    {
6150 		      pattern_def_si = gsi_none ();
6151 		      transform_pattern_stmt = false;
6152 		    }
6153 		}
6154 	      else
6155 		transform_pattern_stmt = false;
6156             }
6157 
6158 	  if (STMT_VINFO_VECTYPE (stmt_info))
6159 	    {
6160 	      unsigned int nunits
6161 		= (unsigned int)
6162 		  TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6163 	      if (!STMT_SLP_TYPE (stmt_info)
6164 		  && nunits != (unsigned int) vectorization_factor
6165 		  && dump_enabled_p ())
6166 		  /* For SLP, VF is set according to the unrolling factor, and
6167 		     not to the vector size, hence for SLP this print is not valid.  */
6168 		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6169 	    }
6170 
6171 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
6172 	     reached.  */
6173 	  if (STMT_SLP_TYPE (stmt_info))
6174 	    {
6175 	      if (!slp_scheduled)
6176 		{
6177 		  slp_scheduled = true;
6178 
6179 		  if (dump_enabled_p ())
6180 		    dump_printf_loc (MSG_NOTE, vect_location,
6181 				     "=== scheduling SLP instances ===\n");
6182 
6183 		  vect_schedule_slp (loop_vinfo, NULL);
6184 		}
6185 
6186 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6187 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6188 		{
6189 		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6190 		    {
6191 		      pattern_def_seq = NULL;
6192 		      gsi_next (&si);
6193 		    }
6194 		  continue;
6195 		}
6196 	    }
6197 
6198 	  /* -------- vectorize statement ------------ */
6199 	  if (dump_enabled_p ())
6200 	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6201 
6202 	  grouped_store = false;
6203 	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6204           if (is_store)
6205             {
6206 	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6207 		{
6208 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6209 		     interleaving chain was completed - free all the stores in
6210 		     the chain.  */
6211 		  gsi_next (&si);
6212 		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6213 		}
6214 	      else
6215 		{
6216 		  /* Free the attached stmt_vec_info and remove the stmt.  */
6217 		  gimple store = gsi_stmt (si);
6218 		  free_stmt_vec_info (store);
6219 		  unlink_stmt_vdef (store);
6220 		  gsi_remove (&si, true);
6221 		  release_defs (store);
6222 		}
6223 
6224 	      /* Stores can only appear at the end of pattern statements.  */
6225 	      gcc_assert (!transform_pattern_stmt);
6226 	      pattern_def_seq = NULL;
6227 	    }
6228 	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6229 	    {
6230 	      pattern_def_seq = NULL;
6231 	      gsi_next (&si);
6232 	    }
6233 	}		        /* stmts in BB */
6234     }				/* BBs in loop */
6235 
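  /* Make the vectorized loop iterate RATIO times; each of its iterations
     now covers VF scalar iterations.  */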
6236   slpeel_make_loop_iterate_ntimes (loop, ratio);
6237 
6238   /* Reduce loop iterations by the vectorization factor.  */
6239   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6240 		      expected_iterations / vectorization_factor);
6241   loop->nb_iterations_upper_bound
6242     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6243   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6244       && loop->nb_iterations_upper_bound != 0)
6245     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6246   if (loop->any_estimate)
6247     {
6248       loop->nb_iterations_estimate
6249         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6250        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6251 	   && loop->nb_iterations_estimate != 0)
6252 	 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6253     }
6254 
6255   if (dump_enabled_p ())
6256     {
6257       dump_printf_loc (MSG_NOTE, vect_location,
6258 		       "LOOP VECTORIZED\n");
6259       if (loop->inner)
6260 	dump_printf_loc (MSG_NOTE, vect_location,
6261 			 "OUTER LOOP VECTORIZED\n");
6262       dump_printf (MSG_NOTE, "\n");
6263     }
6264 }
6265