/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N];  short b[N]; short c[N];   int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

        The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
        Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
        The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

        During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
        The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   subsequent stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

        For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

        Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
        Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors will, for now, need to specify one
   value for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the
   future.

        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
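
/* As an illustrative sketch of the optab check described above (not code
   from this pass; the optab and mode below are just examples), checking
   target support for vector addition on V8HImode looks like:

       if (optab_handler (add_optab, V8HImode)->insn_code == CODE_FOR_nothing)
         ... no target support - the stmt cannot be vectorized ...

   The same query, with the optab and mode chosen per operation and per
   vectype, is what the vectorizability checks ultimately reduce to.  */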

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
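
/* To make the arithmetic concrete (an illustrative example, consistent with
   the V8HI example in the header comment): with 2-byte "short" elements and
   a 16-byte vector size, TYPE_VECTOR_SUBPARTS of the vectype is 8, so the
   VF below becomes 8 and the loop is strip-mined by 8.  */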

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with a constant step.  */
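
/* For instance (an illustrative sketch, in the chrec notation of the scalar
   evolution analyzer): in "for (i = 0; i < N; i++)" the variable i has the
   evolution {0, +, 1}_1 - initial value 0, constant step 1, in loop 1 -
   which is "simple".  An evolution whose step itself evolves, such as
   {0, +, {1, +, 1}_1}_1, is a polynomial of degree 2; its evolution part
   is itself a chrec, so the tree_is_chrec check below rejects it.  */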

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2,
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross-iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified; therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn,
                                           &dummy, &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                }
            }
        }
      else if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross-iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
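
/* A cycle that is neither an induction nor a recognized reduction - for
   example the second-order recurrence below (illustrative, not from the
   sources) - is classified as "unknown" and typically keeps the loop from
   being vectorized:

              loop3:
              for (i=0; i<N; i++)
                { c = a + b;  a = b;  b = c; }

   Here the loop-header phis for 'a' and 'b' feed each other across
   iterations, which matches neither pattern handled above.  */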

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine the number of iterations for which the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
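
/* For example (illustrative): in the do-while form loop assumed by this
   pass, "for (i = 0; i < N; i++) ..." has its exit test at the bottom, so
   the exit condition executes once per iteration and
   number_of_exit_cond_executions returns N; when scev cannot compute the
   count it returns chrec_dont_know and NUMBER_OF_ITERATIONS is left
   unset by the code below.  */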

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as a predicate for DFS-order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  return flow_bb_inside_loop_p (loop, bb);
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
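
/* An example of a loop that fails the countability requirement (purely
   illustrative, not from the sources) is a linked-list walk such as

       while (p != NULL)
         p = p->next;

   whose trip count the scalar evolution analyzer cannot express, so
   vect_get_loop_niters produces no usable NUMBER_OF_ITERATIONS and the
   loop form is rejected below.  */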

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop (unless it is a double
                 reduction, i.e., this phi is vect_reduction_def), because
                 this case would require us to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g., invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
        "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
        vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
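
/* To make the threshold computation above concrete (an illustrative
   example, not taken from any particular target): if the cost model
   returns min_profitable_iters = 10 and that exceeds the user-specified
   scalar loop bound, th becomes 10; a loop then known to run only 8
   iterations fails the LOOP_VINFO_INT_NITERS <= th test above and is
   rejected as not profitable to vectorize.  */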


/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.

     FORNOW: Handle only simple array references, whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}


/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
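
/* For instance (an illustrative sketch): for a sum reduction "s += a[i]"
   the scalar CODE is PLUS_EXPR and *REDUC_CODE becomes REDUC_PLUS_EXPR,
   so the epilogue can reduce the vector of partial sums with a single
   reduction operation.  For MINUS_EXPR the function still returns true
   but sets ERROR_MARK, meaning the epilogue must combine the partial
   results by other means (e.g., by extracting and summing the vector
   elements one by one).  */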


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
1531 
1532 
1533 /* Function vect_is_simple_reduction
1534 
1535    (1) Detect a cross-iteration def-use cycle that represents a simple
1536    reduction computation. We look for the following pattern:
1537 
1538    loop_header:
1539      a1 = phi < a0, a2 >
1540      a3 = ...
1541      a2 = operation (a3, a1)
1542 
1543    such that:
1544    1. operation is commutative and associative and it is safe to
1545       change the order of the computation (if CHECK_REDUCTION is true)
1546    2. no uses of a2 in the loop (a2 is used only outside the loop)
1547    3. no uses of a1 in the loop besides the reduction operation.
1548 
1549    Condition 1 is tested here.
1550    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1551 
1552    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1553    nested cycles, if CHECK_REDUCTION is false.
1554 
1555    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1556    reductions:
1557 
1558      a1 = phi < a0, a2 >
1559      inner loop (def of a3)
1560      a2 = phi < a3 >
1561 */
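
/* For example, in the scalar sum reduction

     s = init;
     for (i = 0; i < n; i++)
       s = s + a[i];

   the loop header has  s_1 = phi <s_0, s_2>  and the latch value is
   defined by  s_2 = s_1 + a_3 , matching pattern (1) above with
   a1 = s_1, a2 = s_2, a3 = a_3 and operation = PLUS_EXPR.  */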
1562 
1563 gimple
1564 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1565                           bool check_reduction, bool *double_reduc)
1566 {
1567   struct loop *loop = (gimple_bb (phi))->loop_father;
1568   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1569   edge latch_e = loop_latch_edge (loop);
1570   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1571   gimple def_stmt, def1 = NULL, def2 = NULL;
1572   enum tree_code code;
1573   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1574   tree type;
1575   int nloop_uses;
1576   tree name;
1577   imm_use_iterator imm_iter;
1578   use_operand_p use_p;
1579   bool phi_def;
1580 
1581   *double_reduc = false;
1582 
1583   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization;
1584      otherwise, we assume outer-loop vectorization.  */
1585   gcc_assert ((check_reduction && loop == vect_loop)
1586               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1587 
1588   name = PHI_RESULT (phi);
1589   nloop_uses = 0;
1590   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1591     {
1592       gimple use_stmt = USE_STMT (use_p);
1593       if (is_gimple_debug (use_stmt))
1594 	continue;
1595       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1596 	  && vinfo_for_stmt (use_stmt)
1597 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1598         nloop_uses++;
1599       if (nloop_uses > 1)
1600         {
1601           if (vect_print_dump_info (REPORT_DETAILS))
1602             fprintf (vect_dump, "reduction used in loop.");
1603           return NULL;
1604         }
1605     }
1606 
1607   if (TREE_CODE (loop_arg) != SSA_NAME)
1608     {
1609       if (vect_print_dump_info (REPORT_DETAILS))
1610 	{
1611 	  fprintf (vect_dump, "reduction: not ssa_name: ");
1612 	  print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1613 	}
1614       return NULL;
1615     }
1616 
1617   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1618   if (!def_stmt)
1619     {
1620       if (vect_print_dump_info (REPORT_DETAILS))
1621 	fprintf (vect_dump, "reduction: no def_stmt.");
1622       return NULL;
1623     }
1624 
1625   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1626     {
1627       if (vect_print_dump_info (REPORT_DETAILS))
1628         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1629       return NULL;
1630     }
1631 
1632   if (is_gimple_assign (def_stmt))
1633     {
1634       name = gimple_assign_lhs (def_stmt);
1635       phi_def = false;
1636     }
1637   else
1638     {
1639       name = PHI_RESULT (def_stmt);
1640       phi_def = true;
1641     }
1642 
1643   nloop_uses = 0;
1644   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1645     {
1646       gimple use_stmt = USE_STMT (use_p);
1647       if (is_gimple_debug (use_stmt))
1648 	continue;
1649       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1650 	  && vinfo_for_stmt (use_stmt)
1651 	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1652 	nloop_uses++;
1653       if (nloop_uses > 1)
1654 	{
1655 	  if (vect_print_dump_info (REPORT_DETAILS))
1656 	    fprintf (vect_dump, "reduction used in loop.");
1657 	  return NULL;
1658 	}
1659     }
1660 
1661   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1662      defined in the inner loop.  */
1663   if (phi_def)
1664     {
1665       op1 = PHI_ARG_DEF (def_stmt, 0);
1666 
1667       if (gimple_phi_num_args (def_stmt) != 1
1668           || TREE_CODE (op1) != SSA_NAME)
1669         {
1670           if (vect_print_dump_info (REPORT_DETAILS))
1671             fprintf (vect_dump, "unsupported phi node definition.");
1672 
1673           return NULL;
1674         }
1675 
1676       def1 = SSA_NAME_DEF_STMT (op1);
1677       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1678           && loop->inner
1679           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1680           && is_gimple_assign (def1))
1681         {
1682           if (vect_print_dump_info (REPORT_DETAILS))
1683             report_vect_op (def_stmt, "detected double reduction: ");
1684 
1685           *double_reduc = true;
1686           return def_stmt;
1687         }
1688 
1689       return NULL;
1690     }
1691 
1692   code = gimple_assign_rhs_code (def_stmt);
1693 
1694   if (check_reduction
1695       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1696     {
1697       if (vect_print_dump_info (REPORT_DETAILS))
1698         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1699       return NULL;
1700     }
1701 
1702   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1703     {
1704       if (code != COND_EXPR)
1705         {
1706           if (vect_print_dump_info (REPORT_DETAILS))
1707 	    report_vect_op (def_stmt, "reduction: not binary operation: ");
1708 
1709           return NULL;
1710         }
1711 
1712       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1713       if (COMPARISON_CLASS_P (op3))
1714         {
1715           op4 = TREE_OPERAND (op3, 1);
1716           op3 = TREE_OPERAND (op3, 0);
1717         }
1718 
1719       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1720       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1721 
1722       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1723         {
1724           if (vect_print_dump_info (REPORT_DETAILS))
1725             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1726 
1727           return NULL;
1728         }
1729     }
1730   else
1731     {
1732       op1 = gimple_assign_rhs1 (def_stmt);
1733       op2 = gimple_assign_rhs2 (def_stmt);
1734 
1735       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1736         {
1737           if (vect_print_dump_info (REPORT_DETAILS))
1738 	    report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1739 
1740           return NULL;
1741         }
1742     }
1743 
1744   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1745   if ((TREE_CODE (op1) == SSA_NAME
1746        && !types_compatible_p (type, TREE_TYPE (op1)))
1747       || (TREE_CODE (op2) == SSA_NAME
1748           && !types_compatible_p (type, TREE_TYPE (op2)))
1749       || (op3 && TREE_CODE (op3) == SSA_NAME
1750           && !types_compatible_p (type, TREE_TYPE (op3)))
1751       || (op4 && TREE_CODE (op4) == SSA_NAME
1752           && !types_compatible_p (type, TREE_TYPE (op4))))
1753     {
1754       if (vect_print_dump_info (REPORT_DETAILS))
1755         {
1756           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1757           print_generic_expr (vect_dump, type, TDF_SLIM);
1758           fprintf (vect_dump, ", operands types: ");
1759           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1760           fprintf (vect_dump, ",");
1761           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1762           if (op3)
1763             {
1764               fprintf (vect_dump, ",");
1765               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1766             }
1767 
1768           if (op4)
1769             {
1770               fprintf (vect_dump, ",");
1771               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1772             }
1773         }
1774 
1775       return NULL;
1776     }
1777 
1778   /* Check that it's ok to change the order of the computation.
1779      Generally, when vectorizing a reduction we change the order of the
1780      computation.  This may change the behavior of the program in some
1781      cases, so we need to check that this is ok.  One exception is when
1782      vectorizing an outer-loop: the inner-loop is executed sequentially,
1783      and therefore vectorizing reductions in the inner-loop during
1784      outer-loop vectorization is safe.  */
1785 
1786   /* CHECKME: check for !flag_finite_math_only too?  */
1787   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1788       && check_reduction)
1789     {
1790       /* Changing the order of operations changes the semantics.  */
1791       if (vect_print_dump_info (REPORT_DETAILS))
1792 	report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1793       return NULL;
1794     }
1795   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1796 	   && check_reduction)
1797     {
1798       /* Changing the order of operations changes the semantics.  */
1799       if (vect_print_dump_info (REPORT_DETAILS))
1800 	report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1801       return NULL;
1802     }
1803   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1804     {
1805       /* Changing the order of operations changes the semantics.  */
1806       if (vect_print_dump_info (REPORT_DETAILS))
1807 	report_vect_op (def_stmt,
1808 			"reduction: unsafe fixed-point math optimization: ");
1809       return NULL;
1810     }
1811 
1812   /* Reduction is safe. We're dealing with one of the following:
1813      1) integer arithmetic and no trapv
1814      2) floating point arithmetic, and special flags permit this optimization
1815      3) nested cycle (i.e., outer loop vectorization).  */
1816   if (TREE_CODE (op1) == SSA_NAME)
1817     def1 = SSA_NAME_DEF_STMT (op1);
1818 
1819   if (TREE_CODE (op2) == SSA_NAME)
1820     def2 = SSA_NAME_DEF_STMT (op2);
1821 
1822   if (code != COND_EXPR
1823       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1824     {
1825       if (vect_print_dump_info (REPORT_DETAILS))
1826 	report_vect_op (def_stmt, "reduction: no defs for operands: ");
1827       return NULL;
1828     }
1829 
1830   /* Check that one def is the reduction def, defined by PHI,
1831      the other def is either defined in the loop ("vect_internal_def"),
1832      or it's an induction (defined by a loop-header phi-node).  */
1833 
1834   if (def2 && def2 == phi
1835       && (code == COND_EXPR
1836           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1837               && (is_gimple_assign (def1)
1838   	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1839                       == vect_induction_def
1840    	          || (gimple_code (def1) == GIMPLE_PHI
1841 	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1842                           == vect_internal_def
1843  	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
1844     {
1845       if (vect_print_dump_info (REPORT_DETAILS))
1846 	report_vect_op (def_stmt, "detected reduction: ");
1847       return def_stmt;
1848     }
1849   else if (def1 && def1 == phi
1850 	   && (code == COND_EXPR
1851                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1852  	           && (is_gimple_assign (def2)
1853 	               || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1854                            == vect_induction_def
1855  	               || (gimple_code (def2) == GIMPLE_PHI
1856 		           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1857                                == vect_internal_def
1858 		           && !is_loop_header_bb_p (gimple_bb (def2)))))))
1859     {
1860       if (check_reduction)
1861         {
1862           /* Swap operands (just for simplicity - so that the rest of the code
1863 	     can assume that the reduction variable is always the last (second)
1864 	     argument).  */
1865           if (vect_print_dump_info (REPORT_DETAILS))
1866 	    report_vect_op (def_stmt,
1867 	  	            "detected reduction: need to swap operands: ");
1868 
1869           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1870  			      gimple_assign_rhs2_ptr (def_stmt));
1871         }
1872       else
1873         {
1874           if (vect_print_dump_info (REPORT_DETAILS))
1875             report_vect_op (def_stmt, "detected reduction: ");
1876         }
1877 
1878       return def_stmt;
1879     }
1880   else
1881     {
1882       if (vect_print_dump_info (REPORT_DETAILS))
1883 	report_vect_op (def_stmt, "reduction: unknown pattern: ");
1884 
1885       return NULL;
1886     }
1887 }
1888 
1889 
1890 /* Function vect_estimate_min_profitable_iters
1891 
1892    Return the number of iterations required for the vector version of the
1893    loop to be profitable relative to the cost of the scalar version of the
1894    loop.
1895 
1896    TODO: Take profile info into account before making vectorization
1897    decisions, if available.  */
1898 
1899 int
1900 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1901 {
1902   int i;
1903   int min_profitable_iters;
1904   int peel_iters_prologue;
1905   int peel_iters_epilogue;
1906   int vec_inside_cost = 0;
1907   int vec_outside_cost = 0;
1908   int scalar_single_iter_cost = 0;
1909   int scalar_outside_cost = 0;
1910   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1911   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1912   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1913   int nbbs = loop->num_nodes;
1914   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1915   int peel_guard_costs = 0;
1916   int innerloop_iters = 0, factor;
1917   VEC (slp_instance, heap) *slp_instances;
1918   slp_instance instance;
1919 
1920   /* Cost model disabled.  */
1921   if (!flag_vect_cost_model)
1922     {
1923       if (vect_print_dump_info (REPORT_COST))
1924         fprintf (vect_dump, "cost model disabled.");
1925       return 0;
1926     }
1927 
1928   /* Requires loop versioning tests to handle misalignment.  */
1929   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930     {
1931       /*  FIXME: Make cost depend on complexity of individual check.  */
1932       vec_outside_cost +=
1933 	VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1934       if (vect_print_dump_info (REPORT_COST))
1935         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1936                  "versioning to treat misalignment.\n");
1937     }
1938 
1939   /* Requires loop versioning with alias checks.  */
1940   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1941     {
1942       /*  FIXME: Make cost depend on complexity of individual check.  */
1943       vec_outside_cost +=
1944         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1945       if (vect_print_dump_info (REPORT_COST))
1946         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1947                  "versioning for aliasing.\n");
1948     }
1949 
1950   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1951       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1952     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1953 
1954   /* Count statements in the scalar loop.  Use this as the scalar cost of a
1955      single iteration for now.
1956 
1957      TODO: Add outer loop support.
1958 
1959      TODO: Consider assigning different costs to different scalar
1960      statements.  */
1961 
1962   /* FORNOW.  */
1963   if (loop->inner)
1964     innerloop_iters = 50; /* FIXME */
1965 
1966   for (i = 0; i < nbbs; i++)
1967     {
1968       gimple_stmt_iterator si;
1969       basic_block bb = bbs[i];
1970 
1971       if (bb->loop_father == loop->inner)
1972  	factor = innerloop_iters;
1973       else
1974  	factor = 1;
1975 
1976       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1977 	{
1978 	  gimple stmt = gsi_stmt (si);
1979 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1980 	  /* Skip stmts that are not vectorized inside the loop.  */
1981 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
1982 	      && (!STMT_VINFO_LIVE_P (stmt_info)
1983 		  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1984 	    continue;
1985 	  scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1986 	  vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1987 	  /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1988 	     some of the "outside" costs are generated inside the outer-loop.  */
1989 	  vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1990 	}
1991     }
1992 
1993   /* Add additional cost for the peeled instructions in prologue and epilogue
1994      loop.
1995 
1996      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1997      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1998 
1999      TODO: Build an expression that represents peel_iters for prologue and
2000      epilogue to be used in a run-time test.  */
2001 
2002   if (byte_misalign < 0)
2003     {
2004       peel_iters_prologue = vf/2;
2005       if (vect_print_dump_info (REPORT_COST))
2006         fprintf (vect_dump, "cost model: "
2007                  "prologue peel iters set to vf/2.");
2008 
2009       /* If the peeling for alignment is unknown, the loop bound of the
2010          main loop becomes unknown.  */
2011       peel_iters_epilogue = vf/2;
2012       if (vect_print_dump_info (REPORT_COST))
2013         fprintf (vect_dump, "cost model: "
2014                  "epilogue peel iters set to vf/2 because "
2015                  "peeling for alignment is unknown.");
2016 
2017       /* If the peeled iteration counts are unknown, count a taken and a
2018          not-taken branch per peeled loop (prologue and epilogue, hence the
2019          factor of 2 below).  Even if the scalar iteration count is known,
2020          the vector count is not, since the prologue peel count is unknown.  */
2021       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2022                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2023     }
2024   else
2025     {
2026       if (byte_misalign)
2027 	{
2028 	  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2029 	  int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2030 	  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2031 	  int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2032 
2033 	  peel_iters_prologue = nelements - (byte_misalign / element_size);
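	  /* For instance, with short elements (element_size = 2) in a V8HI
	     vector (nelements = 8) and byte_misalign = 6, the prologue
	     peels 8 - 6/2 = 5 scalar iterations to reach an aligned
	     access.  */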
2034 	}
2035       else
2036 	peel_iters_prologue = 0;
2037 
2038       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2039         {
2040           peel_iters_epilogue = vf/2;
2041           if (vect_print_dump_info (REPORT_COST))
2042             fprintf (vect_dump, "cost model: "
2043                      "epilogue peel iters set to vf/2 because "
2044                      "loop iterations are unknown.");
2045 
2046 	  /* If the peeled iteration counts are known but the number of scalar
2047 	     loop iterations is unknown, count a taken branch per peeled loop.  */
2048 	  peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2049 
2050         }
2051       else
2052 	{
2053 	  int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2054 	  peel_iters_prologue = niters < peel_iters_prologue ?
2055 					niters : peel_iters_prologue;
2056 	  peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
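	  /* E.g., with niters = 100, vf = 8 and peel_iters_prologue = 5,
	     the vector loop covers 88 iterations and the remaining
	     95 % 8 = 7 iterations go to the epilogue.  */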
2057 	  /* If we need to peel for gaps, but no peeling is required, we have
2058 	     to peel VF iterations.  */
2059 	  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
2060 	    peel_iters_epilogue = vf;
2061 	}
2062     }
2063 
2064   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2065                       + (peel_iters_epilogue * scalar_single_iter_cost)
2066                       + peel_guard_costs;
2067 
2068   /* FORNOW: The scalar outside cost is incremented in one of the
2069      following ways:
2070 
2071      1. The vectorizer checks for alignment and aliasing and generates
2072      a condition that allows dynamic vectorization.  A cost model
2073      check is ANDed with the versioning condition.  Hence the scalar code
2074      path now has the added cost of the versioning check.
2075 
2076        if (cost > th & versioning_check)
2077          jmp to vector code
2078 
2079      Hence run-time scalar is incremented by not-taken branch cost.
2080 
2081      2. The vectorizer then checks if a prologue is required.  If the
2082      cost model check was not done before during versioning, it has to
2083      be done before the prologue check.
2084 
2085        if (cost <= th)
2086          prologue = scalar_iters
2087        if (prologue == 0)
2088          jmp to vector code
2089        else
2090          execute prologue
2091        if (prologue == num_iters)
2092 	 go to exit
2093 
2094      Hence the run-time scalar cost is incremented by a taken branch,
2095      plus a not-taken branch, plus a taken branch cost.
2096 
2097      3. The vectorizer then checks if an epilogue is required.  If the
2098      cost model check was not done before during prologue check, it
2099      has to be done with the epilogue check.
2100 
2101        if (prologue == 0)
2102          jmp to vector code
2103        else
2104          execute prologue
2105        if (prologue == num_iters)
2106 	 go to exit
2107        vector code:
2108          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2109            jmp to epilogue
2110 
2111      Hence the run-time scalar cost should be incremented by 2 taken
2112      branches.
2113 
2114      TODO: The back end may reorder the BBS's differently and reverse
2115      conditions/branch directions.  Change the estimates below to
2116      something more reasonable.  */
2117 
2118   /* If the number of iterations is known and we do not do versioning, we can
2119      decide whether to vectorize at compile time.  Hence the scalar version
2120      does not carry cost model guard costs.  */
2121   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2122       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2123       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2124     {
2125       /* Cost model check occurs at versioning.  */
2126       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2127           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2128 	scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2129       else
2130 	{
2131 	  /* Cost model check occurs at prologue generation.  */
2132 	  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2133 	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2134 	      + TARG_COND_NOT_TAKEN_BRANCH_COST;
2135 	  /* Cost model check occurs at epilogue generation.  */
2136 	  else
2137 	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2138 	}
2139     }
2140 
2141   /* Add SLP costs.  */
2142   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2143   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2144     {
2145       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2146       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2147     }
2148 
2149   /* Calculate the number of iterations required to make the vector version
2150      profitable, relative to the loop bodies only.  The following condition
2151      must hold true:
2152      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2153      where
2154      SIC = scalar iteration cost, VIC = vector iteration cost,
2155      VOC = vector outside cost, VF = vectorization factor,
2156      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2157      SOC = scalar outside cost for run time cost model check.  */
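
  /* As an illustrative instance (made-up costs), take SIC = 4, VIC = 6,
     VF = 4, VOC = 12 and SOC = PL_ITERS = EP_ITERS = 0: the condition
     4 * niters > 6 * niters / 4 + 12 first holds for niters >= 5.  The
     truncating division below yields 4, and the correction that follows
     bumps it to 5 (before the clamping to VF and the final decrement
     for the run-time guard).  */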
2158 
2159   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2160     {
2161       if (vec_outside_cost <= 0)
2162         min_profitable_iters = 1;
2163       else
2164         {
2165           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2166 				  - vec_inside_cost * peel_iters_prologue
2167                                   - vec_inside_cost * peel_iters_epilogue)
2168                                  / ((scalar_single_iter_cost * vf)
2169                                     - vec_inside_cost);
2170 
2171           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2172               <= ((vec_inside_cost * min_profitable_iters)
2173                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2174             min_profitable_iters++;
2175         }
2176     }
2177   /* The vector version will never be profitable.  */
2178   else
2179     {
2180       if (vect_print_dump_info (REPORT_COST))
2181         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2182 		 "divided by the scalar iteration cost = %d "
2183 		 "is greater than or equal to the vectorization factor = %d.",
2184                  vec_inside_cost, scalar_single_iter_cost, vf);
2185       return -1;
2186     }
2187 
2188   if (vect_print_dump_info (REPORT_COST))
2189     {
2190       fprintf (vect_dump, "Cost model analysis: \n");
2191       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2192 	       vec_inside_cost);
2193       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2194 	       vec_outside_cost);
2195       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2196 	       scalar_single_iter_cost);
2197       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2198       fprintf (vect_dump, "  prologue iterations: %d\n",
2199                peel_iters_prologue);
2200       fprintf (vect_dump, "  epilogue iterations: %d\n",
2201                peel_iters_epilogue);
2202       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2203 	       min_profitable_iters);
2204     }
2205 
2206   min_profitable_iters =
2207 	min_profitable_iters < vf ? vf : min_profitable_iters;
2208 
2209   /* Because the condition we create is:
2210      if (niters <= min_profitable_iters)
2211        then skip the vectorized loop.  */
2212   min_profitable_iters--;
2213 
2214   if (vect_print_dump_info (REPORT_COST))
2215     fprintf (vect_dump, "  Profitability threshold = %d\n",
2216 	     min_profitable_iters);
2217 
2218   return min_profitable_iters;
2219 }
2220 
2221 
2222 /* TODO: There is a close dependency between the vect_model_*_cost and the
2223    vectorizable_* functions.  Redesign to avoid such maintenance issues.  */
2224 
2225 /* Function vect_model_reduction_cost.
2226 
2227    Models cost for a reduction operation, including the vector ops
2228    generated within the strip-mine loop, the initial definition before
2229    the loop, and the epilogue code that must be generated.  */
2230 
2231 static bool
2232 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2233 			   int ncopies)
2234 {
2235   int outer_cost = 0;
2236   enum tree_code code;
2237   optab optab;
2238   tree vectype;
2239   gimple stmt, orig_stmt;
2240   tree reduction_op;
2241   enum machine_mode mode;
2242   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2243   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2244 
2245 
2246   /* Cost of reduction op inside loop.  */
2247   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2248 
2249   stmt = STMT_VINFO_STMT (stmt_info);
2250 
2251   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2252     {
2253     case GIMPLE_SINGLE_RHS:
2254       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2255       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2256       break;
2257     case GIMPLE_UNARY_RHS:
2258       reduction_op = gimple_assign_rhs1 (stmt);
2259       break;
2260     case GIMPLE_BINARY_RHS:
2261       reduction_op = gimple_assign_rhs2 (stmt);
2262       break;
2263     default:
2264       gcc_unreachable ();
2265     }
2266 
2267   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2268   if (!vectype)
2269     {
2270       if (vect_print_dump_info (REPORT_COST))
2271         {
2272           fprintf (vect_dump, "unsupported data-type ");
2273           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2274         }
2275       return false;
2276     }
2277 
2278   mode = TYPE_MODE (vectype);
2279   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2280 
2281   if (!orig_stmt)
2282     orig_stmt = STMT_VINFO_STMT (stmt_info);
2283 
2284   code = gimple_assign_rhs_code (orig_stmt);
2285 
2286   /* Add in cost for initial definition.  */
2287   outer_cost += TARG_SCALAR_TO_VEC_COST;
2288 
2289   /* Determine the cost of the epilogue code.
2290 
2291      If a reduction operator is available, it reduces the whole vector in one
2292      statement; a scalar extract is also required.  */
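
  /* For instance, for a V8HI vector without a usable reduc_code, the
     whole-vector-shift scheme below costs 2 * log2(8) = 6 vector stmts
     plus one vec-to-scalar extract, whereas the extract-based fallback
     costs 8 + 7 = 15 vector stmts.  */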
2293 
2294   if (!nested_in_vect_loop_p (loop, orig_stmt))
2295     {
2296       if (reduc_code != ERROR_MARK)
2297 	outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2298       else
2299 	{
2300 	  int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2301 	  tree bitsize =
2302 	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2303 	  int element_bitsize = tree_low_cst (bitsize, 1);
2304 	  int nelements = vec_size_in_bits / element_bitsize;
2305 
2306 	  optab = optab_for_tree_code (code, vectype, optab_default);
2307 
2308 	  /* We have a whole vector shift available.  */
2309 	  if (VECTOR_MODE_P (mode)
2310 	      && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2311 	      && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2312 	    /* Final reduction via vector shifts and the reduction operator. Also
2313 	       requires scalar extract.  */
2314 	    outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
2315 				+ TARG_VEC_TO_SCALAR_COST);
2316 	  else
2317 	    /* Use extracts and reduction op for final reduction.  For N elements,
2318                we have N extracts and N-1 reduction ops.  */
2319 	    outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2320 	}
2321     }
2322 
2323   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2324 
2325   if (vect_print_dump_info (REPORT_COST))
2326     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2327              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2328              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2329 
2330   return true;
2331 }
2332 
2333 
2334 /* Function vect_model_induction_cost.
2335 
2336    Models cost for induction operations.  */
2337 
2338 static void
2339 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2340 {
2341   /* loop cost for vec_loop.  */
2342   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2343   /* prologue cost for vec_init and vec_step.  */
2344   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2345 
2346   if (vect_print_dump_info (REPORT_COST))
2347     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2348              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2349              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2350 }
2351 
2352 
2353 /* Function get_initial_def_for_induction
2354 
2355    Input:
2356    STMT - a stmt that performs an induction operation in the loop.
2357    IV_PHI - the initial value of the induction variable
2358 
2359    Output:
2360    Return a vector variable, initialized with the first VF values of
2361    the induction variable. E.g., for an iv with IV_PHI='X' and
2362    evolution S, for a vector of 4 units, we want to return:
2363    [X, X + S, X + 2*S, X + 3*S].  */
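
/* For instance, for an iv with IV_PHI = 0, step S = 1 and a 4-unit
   vector, this returns vec_init = [0, 1, 2, 3]; the step vector built
   further below is then [VF*S, VF*S, VF*S, VF*S] = [4, 4, 4, 4] for
   VF = 4.  */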
2364 
2365 static tree
2366 get_initial_def_for_induction (gimple iv_phi)
2367 {
2368   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2369   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2370   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2371   tree scalar_type;
2372   tree vectype;
2373   int nunits;
2374   edge pe = loop_preheader_edge (loop);
2375   struct loop *iv_loop;
2376   basic_block new_bb;
2377   tree vec, vec_init, vec_step, t;
2378   tree access_fn;
2379   tree new_var;
2380   tree new_name;
2381   gimple init_stmt, induction_phi, new_stmt;
2382   tree induc_def, vec_def, vec_dest;
2383   tree init_expr, step_expr;
2384   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2385   int i;
2386   bool ok;
2387   int ncopies;
2388   tree expr;
2389   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2390   bool nested_in_vect_loop = false;
2391   gimple_seq stmts = NULL;
2392   imm_use_iterator imm_iter;
2393   use_operand_p use_p;
2394   gimple exit_phi;
2395   edge latch_e;
2396   tree loop_arg;
2397   gimple_stmt_iterator si;
2398   basic_block bb = gimple_bb (iv_phi);
2399   tree stepvectype;
2400   tree resvectype;
2401 
2402   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2403   if (nested_in_vect_loop_p (loop, iv_phi))
2404     {
2405       nested_in_vect_loop = true;
2406       iv_loop = loop->inner;
2407     }
2408   else
2409     iv_loop = loop;
2410   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2411 
2412   latch_e = loop_latch_edge (iv_loop);
2413   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2414 
2415   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2416   gcc_assert (access_fn);
2417   STRIP_NOPS (access_fn);
2418   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2419                                     &init_expr, &step_expr);
2420   gcc_assert (ok);
2421   pe = loop_preheader_edge (iv_loop);
2422 
2423   scalar_type = TREE_TYPE (init_expr);
2424   vectype = get_vectype_for_scalar_type (scalar_type);
2425   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2426   gcc_assert (vectype);
2427   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2428   ncopies = vf / nunits;
2429 
2430   gcc_assert (phi_info);
2431   gcc_assert (ncopies >= 1);
2432 
2433   /* Find the first insertion point in the BB.  */
2434   si = gsi_after_labels (bb);
2435 
2436   /* Create the vector that holds the initial_value of the induction.  */
2437   if (nested_in_vect_loop)
2438     {
2439       /* iv_loop is nested in the loop to be vectorized.  init_expr has already
2440 	 been created during vectorization of the previous stmts; we obtain it
2441 	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
2442       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2443                                            loop_preheader_edge (iv_loop));
2444       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2445     }
2446   else
2447     {
2448       /* iv_loop is the loop to be vectorized. Create:
2449 	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2450       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2451       add_referenced_var (new_var);
2452 
2453       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2454       if (stmts)
2455 	{
2456 	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2457 	  gcc_assert (!new_bb);
2458 	}
2459 
2460       t = NULL_TREE;
2461       t = tree_cons (NULL_TREE, new_name, t);
2462       for (i = 1; i < nunits; i++)
2463 	{
2464 	  /* Create: new_name_i = new_name + step_expr  */
2465 	  enum tree_code code = POINTER_TYPE_P (scalar_type)
2466 				? POINTER_PLUS_EXPR : PLUS_EXPR;
2467 	  init_stmt = gimple_build_assign_with_ops (code, new_var,
2468 						    new_name, step_expr);
2469 	  new_name = make_ssa_name (new_var, init_stmt);
2470 	  gimple_assign_set_lhs (init_stmt, new_name);
2471 
2472 	  new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2473 	  gcc_assert (!new_bb);
2474 
2475 	  if (vect_print_dump_info (REPORT_DETAILS))
2476 	    {
2477 	      fprintf (vect_dump, "created new init_stmt: ");
2478 	      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2479 	    }
2480 	  t = tree_cons (NULL_TREE, new_name, t);
2481 	}
2482       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1].  */
2483       vec = build_constructor_from_list (vectype, nreverse (t));
2484       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2485     }
2486 
2487 
2488   /* Create the vector that holds the step of the induction.  */
2489   if (nested_in_vect_loop)
2490     /* iv_loop is nested in the loop to be vectorized. Generate:
2491        vec_step = [S, S, S, S]  */
2492     new_name = step_expr;
2493   else
2494     {
2495       /* iv_loop is the loop to be vectorized. Generate:
2496 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2497       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2498       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2499 			      expr, step_expr);
2500     }
2501 
2502   t = NULL_TREE;
2503   for (i = 0; i < nunits; i++)
2504     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2505   gcc_assert (CONSTANT_CLASS_P (new_name));
2506   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2507   gcc_assert (stepvectype);
2508   vec = build_vector (stepvectype, t);
2509   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2510 
2511 
2512   /* Create the following def-use cycle:
2513      loop prolog:
2514          vec_init = ...
2515 	 vec_step = ...
2516      loop:
2517          vec_iv = PHI <vec_init, vec_loop>
2518          ...
2519          STMT
2520          ...
2521          vec_loop = vec_iv + vec_step;  */
2522 
2523   /* Create the induction-phi that defines the induction-operand.  */
2524   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2525   add_referenced_var (vec_dest);
2526   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2527   set_vinfo_for_stmt (induction_phi,
2528 		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2529   induc_def = PHI_RESULT (induction_phi);
2530 
2531   /* Create the iv update inside the loop.  */
2532   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2533 					   induc_def, vec_step);
2534   vec_def = make_ssa_name (vec_dest, new_stmt);
2535   gimple_assign_set_lhs (new_stmt, vec_def);
2536   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2537   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2538                                                    NULL));
2539 
2540   /* Set the arguments of the phi node:  */
2541   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2542   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2543 	       UNKNOWN_LOCATION);
2544 
2545 
2546   /* In case the vectorization factor (VF) is bigger than the number
2547      of elements that we can fit in a vectype (nunits), we have to generate
2548      more than one vector stmt, i.e., we need to "unroll" the
2549      vector stmt by a factor VF/nunits.  For more details see the
2550      documentation in vectorizable_operation.  */
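
  /* E.g., with VF = 8 and 4-element vectors (ncopies = 2), the
     induction phi above advances by [8*S, 8*S, 8*S, 8*S] per vector
     iteration, while each additional copy is offset from the previous
     one by the step [nunits*S, ...] = [4*S, 4*S, 4*S, 4*S] built
     below.  */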
2551 
2552   if (ncopies > 1)
2553     {
2554       stmt_vec_info prev_stmt_vinfo;
2555       /* FORNOW. This restriction should be relaxed.  */
2556       gcc_assert (!nested_in_vect_loop);
2557 
2558       /* Create the vector that holds the step of the induction.  */
2559       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2560       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2561 			      expr, step_expr);
2562       t = NULL_TREE;
2563       for (i = 0; i < nunits; i++)
2564 	t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2565       gcc_assert (CONSTANT_CLASS_P (new_name));
2566       vec = build_vector (stepvectype, t);
2567       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2568 
2569       vec_def = induc_def;
2570       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2571       for (i = 1; i < ncopies; i++)
2572 	{
2573 	  /* vec_i = vec_prev + vec_step  */
2574 	  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2575 						   vec_def, vec_step);
2576 	  vec_def = make_ssa_name (vec_dest, new_stmt);
2577 	  gimple_assign_set_lhs (new_stmt, vec_def);
2578 
2579 	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2580 	  if (!useless_type_conversion_p (resvectype, vectype))
2581 	    {
2582 	      new_stmt = gimple_build_assign_with_ops
2583 		  (VIEW_CONVERT_EXPR,
2584 		   vect_get_new_vect_var (resvectype, vect_simple_var,
2585 					  "vec_iv_"),
2586 		   build1 (VIEW_CONVERT_EXPR, resvectype,
2587 			   gimple_assign_lhs (new_stmt)), NULL_TREE);
2588 	      gimple_assign_set_lhs (new_stmt,
2589 				     make_ssa_name
2590 				       (gimple_assign_lhs (new_stmt), new_stmt));
2591 	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2592 	    }
2593 	  set_vinfo_for_stmt (new_stmt,
2594 			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2595 	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2596 	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2597 	}
2598     }
2599 
2600   if (nested_in_vect_loop)
2601     {
2602       /* Find the loop-closed exit-phi of the induction, and record
2603          the final vector of induction results:  */
2604       exit_phi = NULL;
2605       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2606         {
2607 	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2608 	    {
2609 	      exit_phi = USE_STMT (use_p);
2610 	      break;
2611 	    }
2612         }
2613       if (exit_phi)
2614 	{
2615 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2616 	  /* FORNOW.  We do not yet support the case in which an inner-loop
2617 	     induction is used only outside the outer-loop.  */
2618 	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2619 		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
2620 
2621 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2622 	  if (vect_print_dump_info (REPORT_DETAILS))
2623 	    {
2624 	      fprintf (vect_dump, "vector of inductions after inner-loop:");
2625 	      print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2626 	    }
2627 	}
2628     }
2629 
2630 
2631   if (vect_print_dump_info (REPORT_DETAILS))
2632     {
2633       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2634       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2635       fprintf (vect_dump, "\n");
2636       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2637     }
2638 
2639   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2640   if (!useless_type_conversion_p (resvectype, vectype))
2641     {
2642       new_stmt = gimple_build_assign_with_ops
2643 	 (VIEW_CONVERT_EXPR,
2644 	  vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2645 	  build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2646       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2647       gimple_assign_set_lhs (new_stmt, induc_def);
2648       si = gsi_start_bb (bb);
2649       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2650       set_vinfo_for_stmt (new_stmt,
2651 			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2652       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2653 	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2654     }
2655 
2656   return induc_def;
2657 }
2658 
2659 
2660 /* Function get_initial_def_for_reduction
2661 
2662    Input:
2663    STMT - a stmt that performs a reduction operation in the loop.
2664    INIT_VAL - the initial value of the reduction variable
2665 
2666    Output:
2667    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2668         of the reduction (used for adjusting the epilog - see below).
2669    Return a vector variable, initialized according to the operation that STMT
2670         performs. This vector will be used as the initial value of the
2671         vector of partial results.
2672 
2673    Option1 (adjust in epilog): Initialize the vector as follows:
2674      add/bit or/xor:    [0,0,...,0,0]
2675      mult/bit and:      [1,1,...,1,1]
2676      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2677    and when necessary (e.g. add/mult case) let the caller know
2678    that it needs to adjust the result by init_val.
2679 
2680    Option2: Initialize the vector as follows:
2681      add/bit or/xor:    [init_val,0,0,...,0]
2682      mult/bit and:      [init_val,1,1,...,1]
2683      min/max/cond_expr: [init_val,init_val,...,init_val]
2684    and no adjustments are needed.
2685 
2686    For example, for the following code:
2687 
2688    s = init_val;
2689    for (i=0;i<n;i++)
2690      s = s + a[i];
2691 
2692    STMT is 's = s + a[i]', and the reduction variable is 's'.
2693    For a vector of 4 units, we want to return either [0,0,0,init_val],
2694    or [0,0,0,0] and let the caller know that it needs to adjust
2695    the result at the end by 'init_val'.
2696 
2697    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
2698    is not NULL, because that way the initialization vector is simpler (the
2699    same element in all entries); otherwise we use Option2.
2700 
2701    A cost model should help decide between these two schemes.  */
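
/* As a worked instance (made-up values), take init_val = 5 and a single
   4-unit vector iteration that adds [1, 2, 3, 4]:
     Option1 starts from [0, 0, 0, 0], giving partial sums [1, 2, 3, 4],
     whose reduction is 10, adjusted by init_val to 15;
     Option2 starts from [0, 0, 0, 5], giving [1, 2, 3, 9], whose
     reduction is 15 directly.  */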
2702 
2703 tree
2704 get_initial_def_for_reduction (gimple stmt, tree init_val,
2705                                tree *adjustment_def)
2706 {
2707   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2708   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2710   tree scalar_type = TREE_TYPE (init_val);
2711   tree vectype = get_vectype_for_scalar_type (scalar_type);
2712   int nunits;
2713   enum tree_code code = gimple_assign_rhs_code (stmt);
2714   tree def_for_init;
2715   tree init_def;
2716   tree t = NULL_TREE;
2717   int i;
2718   bool nested_in_vect_loop = false;
2719   tree init_value;
2720   REAL_VALUE_TYPE real_init_val = dconst0;
2721   int int_init_val = 0;
2722   gimple def_stmt = NULL;
2723 
2724   gcc_assert (vectype);
2725   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2726 
2727   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2728 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
2729 
2730   if (nested_in_vect_loop_p (loop, stmt))
2731     nested_in_vect_loop = true;
2732   else
2733     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2734 
2735   /* In case of double reduction we only create a vector variable to be put
2736      in the reduction phi node. The actual statement creation is done in
2737      vect_create_epilog_for_reduction.  */
2738   if (adjustment_def && nested_in_vect_loop
2739       && TREE_CODE (init_val) == SSA_NAME
2740       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2741       && gimple_code (def_stmt) == GIMPLE_PHI
2742       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2743       && vinfo_for_stmt (def_stmt)
2744       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2745           == vect_double_reduction_def)
2746     {
2747       *adjustment_def = NULL;
2748       return vect_create_destination_var (init_val, vectype);
2749     }
2750 
2751   if (TREE_CONSTANT (init_val))
2752     {
2753       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2754         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2755       else
2756         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2757     }
2758   else
2759     init_value = init_val;
2760 
2761   switch (code)
2762     {
2763       case WIDEN_SUM_EXPR:
2764       case DOT_PROD_EXPR:
2765       case PLUS_EXPR:
2766       case MINUS_EXPR:
2767       case BIT_IOR_EXPR:
2768       case BIT_XOR_EXPR:
2769       case MULT_EXPR:
2770       case BIT_AND_EXPR:
2771         /* ADJUSTMENT_DEF is NULL when called from
2772            vect_create_epilog_for_reduction to vectorize a double reduction.  */
2773         if (adjustment_def)
2774           {
2775             if (nested_in_vect_loop)
2776               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2777                                                               NULL);
2778             else
2779               *adjustment_def = init_val;
2780           }
2781 
2782         if (code == MULT_EXPR)
2783           {
2784             real_init_val = dconst1;
2785             int_init_val = 1;
2786           }
2787 
2788         if (code == BIT_AND_EXPR)
2789           int_init_val = -1;
2790 
2791         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2792           def_for_init = build_real (scalar_type, real_init_val);
2793         else
2794           def_for_init = build_int_cst (scalar_type, int_init_val);
2795 
2796         /* Fill all vector elements but the first with '0' or '1'.  */
2797         for (i = nunits - 2; i >= 0; --i)
2798           t = tree_cons (NULL_TREE, def_for_init, t);
2799 
2800         /* Option1: the first element is '0' or '1' as well.  */
2801         if (adjustment_def)
2802           {
2803             t = tree_cons (NULL_TREE, def_for_init, t);
2804             init_def = build_vector (vectype, t);
2805             break;
2806           }
2807 
2808         /* Option2: the first element is INIT_VAL.  */
2809         t = tree_cons (NULL_TREE, init_value, t);
2810         if (TREE_CONSTANT (init_val))
2811           init_def = build_vector (vectype, t);
2812         else
2813           init_def = build_constructor_from_list (vectype, t);
2814 
2815         break;
2816 
2817       case MIN_EXPR:
2818       case MAX_EXPR:
2819       case COND_EXPR:
2820         if (adjustment_def)
2821           {
2822             *adjustment_def = NULL_TREE;
2823             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2824             break;
2825           }
2826 
2827         for (i = nunits - 1; i >= 0; --i)
2828           t = tree_cons (NULL_TREE, init_value, t);
2829 
2830         if (TREE_CONSTANT (init_val))
2831           init_def = build_vector (vectype, t);
2832         else
2833           init_def = build_constructor_from_list (vectype, t);
2834 
2835         break;
2836 
2837       default:
2838         gcc_unreachable ();
2839     }
2840 
2841   return init_def;
2842 }
2843 
2844 
2845 /* Function vect_create_epilog_for_reduction
2846 
2847    Create code at the loop-epilog to finalize the result of a reduction
2848    computation.
2849 
2850    VECT_DEF is a vector of partial results.
2851    REDUC_CODE is the tree-code for the epilog reduction.
2852    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2853      number of elements that we can fit in a vectype (nunits). In this case
2854      we have to generate more than one vector stmt, i.e., we need to "unroll"
2855      the vector stmt by a factor VF/nunits.  For more details see documentation
2856      in vectorizable_operation.
2857    STMT is the scalar reduction stmt that is being vectorized.
2858    REDUCTION_PHI is the phi-node that carries the reduction computation.
2859    REDUC_INDEX is the index of the operand in the right hand side of the
2860      statement that is defined by REDUCTION_PHI.
2861    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2862 
2863    This function:
2864    1. Creates the reduction def-use cycle: sets the arguments for
2865       REDUCTION_PHI:
2866       The loop-entry argument is the vectorized initial-value of the reduction.
2867       The loop-latch argument is VECT_DEF - the vector of partial sums.
2868    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2869       by applying the operation specified by REDUC_CODE if available, or by
2870       other means (whole-vector shifts or a scalar loop).
2871       The function also creates a new phi node at the loop exit to preserve
2872       loop-closed form, as illustrated below.
2873 
2874      The flow at the entry to this function:
2875 
2876         loop:
2877           vec_def = phi <null, null>            # REDUCTION_PHI
2878           VECT_DEF = vector_stmt                # vectorized form of STMT
2879           s_loop = scalar_stmt                  # (scalar) STMT
2880         loop_exit:
2881           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2882           use <s_out0>
2883           use <s_out0>
2884 
2885      The above is transformed by this function into:
2886 
2887         loop:
2888           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2889           VECT_DEF = vector_stmt                # vectorized form of STMT
2890           s_loop = scalar_stmt                  # (scalar) STMT
2891         loop_exit:
2892           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2893           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2894           v_out2 = reduce <v_out1>
2895           s_out3 = extract_field <v_out2, 0>
2896           s_out4 = adjust_result <s_out3>
2897           use <s_out4>
2898           use <s_out4>
2899 */
2900 
2901 static void
2902 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2903 				  int ncopies,
2904 				  enum tree_code reduc_code,
2905 				  gimple reduction_phi,
2906                                   int reduc_index,
2907                                   bool double_reduc)
2908 {
2909   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2910   stmt_vec_info prev_phi_info;
2911   tree vectype;
2912   enum machine_mode mode;
2913   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2914   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2915   basic_block exit_bb;
2916   tree scalar_dest;
2917   tree scalar_type;
2918   gimple new_phi = NULL, phi;
2919   gimple_stmt_iterator exit_gsi;
2920   tree vec_dest;
2921   tree new_temp = NULL_TREE;
2922   tree new_name;
2923   gimple epilog_stmt = NULL;
2924   tree new_scalar_dest, new_dest;
2925   gimple exit_phi;
2926   tree bitsize, bitpos;
2927   enum tree_code code = gimple_assign_rhs_code (stmt);
2928   tree adjustment_def;
2929   tree vec_initial_def, def;
2930   tree orig_name;
2931   imm_use_iterator imm_iter;
2932   use_operand_p use_p;
2933   bool extract_scalar_result = false;
2934   tree reduction_op, expr;
2935   gimple orig_stmt;
2936   gimple use_stmt;
2937   bool nested_in_vect_loop = false;
2938   VEC(gimple,heap) *phis = NULL;
2939   enum vect_def_type dt = vect_unknown_def_type;
2940   int j, i;
2941 
2942   if (nested_in_vect_loop_p (loop, stmt))
2943     {
2944       outer_loop = loop;
2945       loop = loop->inner;
2946       nested_in_vect_loop = true;
2947     }
2948 
2949   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2950     {
2951     case GIMPLE_SINGLE_RHS:
2952       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2953                                        == ternary_op);
2954       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2955       break;
2956     case GIMPLE_UNARY_RHS:
2957       reduction_op = gimple_assign_rhs1 (stmt);
2958       break;
2959     case GIMPLE_BINARY_RHS:
2960       reduction_op = reduc_index ?
2961                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2962       break;
2963     default:
2964       gcc_unreachable ();
2965     }
2966 
2967   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2968   gcc_assert (vectype);
2969   mode = TYPE_MODE (vectype);
2970 
2971   /*** 1. Create the reduction def-use cycle  ***/
2972 
2973   /* For the case of reduction, vect_get_vec_def_for_operand returns
2974      the scalar def before the loop, that defines the initial value
2975      of the reduction variable.  */
2976   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2977 					          &adjustment_def);
2978 
2979   phi = reduction_phi;
2980   def = vect_def;
2981   for (j = 0; j < ncopies; j++)
2982     {
2983       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2984       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2985 		   UNKNOWN_LOCATION);
2986 
2987       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2988       if (j > 0)
2989         def = vect_get_vec_def_for_stmt_copy (dt, def);
2990       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2991 
2992       if (vect_print_dump_info (REPORT_DETAILS))
2993 	{
2994 	  fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2995 	  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2996           fprintf (vect_dump, "\n");
2997           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2998 	}
2999 
3000       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3001     }
3002 
3003   /*** 2. Create epilog code
3004 	  The reduction epilog code operates across the elements of the vector
3005           of partial results computed by the vectorized loop.
3006           The reduction epilog code consists of:
3007           step 1: compute the scalar result in a vector (v_out2)
3008           step 2: extract the scalar result (s_out3) from the vector (v_out2)
3009           step 3: adjust the scalar result (s_out3) if needed.
3010 
3011           Step 1 can be accomplished using one of the following three schemes:
3012           (scheme 1) using reduc_code, if available.
3013           (scheme 2) using whole-vector shifts, if available.
3014           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3015                      combined.
3016 
3017           The overall epilog code looks like this:
3018 
3019           s_out0 = phi <s_loop>         # original EXIT_PHI
3020           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3021           v_out2 = reduce <v_out1>              # step 1
3022           s_out3 = extract_field <v_out2, 0>    # step 2
3023           s_out4 = adjust_result <s_out3>       # step 3
3024 
3025           (step 3 is optional, and steps 1 and 2 may be combined).
3026           Lastly, the uses of s_out0 are replaced by s_out4.
3027 
3028 	  ***/
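  /* As a concrete illustration (hypothetical values, assuming a V4SI
     integer sum for which a direct reduction tree-code is available,
     i.e. scheme 1), the epilog conceptually becomes:

          v_out1 = phi <VECT_DEF>              # partial sums {s0,s1,s2,s3}
          v_out2 = reduc_plus <v_out1>         # step 1: s0+s1+s2+s3
          s_out3 = extract_field <v_out2, 0>   # step 2
          s_out4 = s_out3 + adjustment_def     # step 3, only if needed

     Which element is extracted in step 2 is target dependent - see the
     BYTES_BIG_ENDIAN handling in section 2.4 below.  */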
3029 
3030   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
3031         v_out1 = phi <v_loop>  */
3032 
3033   exit_bb = single_exit (loop)->dest;
3034   def = vect_def;
3035   prev_phi_info = NULL;
3036   for (j = 0; j < ncopies; j++)
3037     {
3038       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3039       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3040       if (j == 0)
3041 	new_phi = phi;
3042       else
3043 	{
3044 	  def = vect_get_vec_def_for_stmt_copy (dt, def);
3045 	  STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3046 	}
3047       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3048       prev_phi_info = vinfo_for_stmt (phi);
3049     }
3050 
3051   exit_gsi = gsi_after_labels (exit_bb);
3052 
3053   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3054          (i.e. when reduc_code is not available) and in the final adjustment
3055 	 code (if needed).  Also get the original scalar reduction variable as
3056          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3057          represents a reduction pattern), the tree-code and scalar-def are
3058          taken from the original stmt that the pattern-stmt (STMT) replaces.
3059          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3060          are taken from STMT.  */
3061 
3062   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3063   if (!orig_stmt)
3064     {
3065       /* Regular reduction  */
3066       orig_stmt = stmt;
3067     }
3068   else
3069     {
3070       /* Reduction pattern  */
3071       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3072       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3073       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3074     }
3075 
3076   code = gimple_assign_rhs_code (orig_stmt);
3077   scalar_dest = gimple_assign_lhs (orig_stmt);
3078   scalar_type = TREE_TYPE (scalar_dest);
3079   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3080   bitsize = TYPE_SIZE (scalar_type);
3081 
3082   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3083      partial results are added and not subtracted.  */
3084   if (code == MINUS_EXPR)
3085     code = PLUS_EXPR;
3086 
3087   /* In case this is a reduction in an inner-loop while vectorizing an outer
3088      loop, we don't need to extract a single scalar result at the end of the
3089      inner-loop (unless it is a double reduction, i.e., the use of the
3090      reduction is outside the outer-loop).  The final vector of partial
3091      results will be used in the vectorized outer-loop, or reduced to a
3092      scalar result at the end of the outer-loop.  */
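  /* For instance (a sketch of the intent, not the exact IL), when
     outer-loop vectorizing

          for (i = 0; i < N; i++) {
            s = 0;
            for (j = 0; j < M; j++)
              s += a[i][j];
            b[i] = s;
          }

     each vector lane accumulates the sum for one of the outer-loop
     iterations handled together, so the inner-loop already produces the
     vector of partial results that the vectorized outer-loop consumes.  */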
3093   if (nested_in_vect_loop && !double_reduc)
3094     goto vect_finalize_reduction;
3095 
3096   /* The epilogue is created for the outer-loop, i.e., for the loop being
3097      vectorized.  */
3098   if (double_reduc)
3099     loop = outer_loop;
3100 
3101   /* FORNOW */
3102   gcc_assert (ncopies == 1);
3103 
3104   /* 2.3 Create the reduction code, using one of the three schemes described
3105          above.  */
3106 
3107   if (reduc_code != ERROR_MARK)
3108     {
3109       tree tmp;
3110 
3111       /*** Case 1:  Create:
3112 	   v_out2 = reduc_expr <v_out1>  */
3113 
3114       if (vect_print_dump_info (REPORT_DETAILS))
3115 	fprintf (vect_dump, "Reduce using direct vector reduction.");
3116 
3117       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3118       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3119       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3120       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3121       gimple_assign_set_lhs (epilog_stmt, new_temp);
3122       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3123 
3124       extract_scalar_result = true;
3125     }
3126   else
3127     {
3128       enum tree_code shift_code = ERROR_MARK;
3129       bool have_whole_vector_shift = true;
3130       int bit_offset;
3131       int element_bitsize = tree_low_cst (bitsize, 1);
3132       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3133       tree vec_temp;
3134 
3135       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3136 	shift_code = VEC_RSHIFT_EXPR;
3137       else
3138 	have_whole_vector_shift = false;
3139 
3140       /* Regardless of whether we have a whole vector shift, if we're
3141 	 emulating the operation via tree-vect-generic, we don't want
3142 	 to use it.  Only the first round of the reduction is likely
3143 	 to still be profitable via emulation.  */
3144       /* ??? It might be better to emit a reduction tree code here, so that
3145 	 tree-vect-generic can expand the first round via bit tricks.  */
3146       if (!VECTOR_MODE_P (mode))
3147 	have_whole_vector_shift = false;
3148       else
3149 	{
3150 	  optab optab = optab_for_tree_code (code, vectype, optab_default);
3151 	  if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3152 	    have_whole_vector_shift = false;
3153 	}
3154 
3155       if (have_whole_vector_shift)
3156         {
3157 	  /*** Case 2: Create:
3158 	     for (offset = VS/2; offset >= element_size; offset/=2)
3159 	        {
3160 	          Create:  va' = vec_shift <va, offset>
3161 	          Create:  va = vop <va, va'>
3162 	        }  */
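	  /* For example (an illustrative sketch): for a 128-bit vector of
	     four 32-bit ints this performs two rounds:

	        va' = vec_shift <va, 64>;  va = vop <va, va'>;
	        va' = vec_shift <va, 32>;  va = vop <va, va'>;

	     after which a single element of va holds the combination of all
	     four partial results, extracted in section 2.4 below.  */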
3163 
3164 	  if (vect_print_dump_info (REPORT_DETAILS))
3165 	    fprintf (vect_dump, "Reduce using vector shifts");
3166 
3167 	  vec_dest = vect_create_destination_var (scalar_dest, vectype);
3168 	  new_temp = PHI_RESULT (new_phi);
3169 
3170 	  for (bit_offset = vec_size_in_bits/2;
3171 	       bit_offset >= element_bitsize;
3172 	       bit_offset /= 2)
3173 	    {
3174 	      tree bitpos = size_int (bit_offset);
3175 
3176 	      epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3177 							  new_temp, bitpos);
3178 	      new_name = make_ssa_name (vec_dest, epilog_stmt);
3179 	      gimple_assign_set_lhs (epilog_stmt, new_name);
3180 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3181 
3182 	      epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3183 							  new_name, new_temp);
3184 	      new_temp = make_ssa_name (vec_dest, epilog_stmt);
3185 	      gimple_assign_set_lhs (epilog_stmt, new_temp);
3186 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3187 	    }
3188 
3189 	  extract_scalar_result = true;
3190 	}
3191       else
3192         {
3193 	  tree rhs;
3194 
3195 	  /*** Case 3: Create:
3196 	     s = extract_field <v_out2, 0>
3197 	     for (offset = element_size;
3198 		  offset < vector_size;
3199 		  offset += element_size)
3200 	       {
3201 	         Create:  s' = extract_field <v_out2, offset>
3202 	         Create:  s = op <s, s'>
3203 	       }  */
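	  /* E.g., for four 32-bit ints this emits, conceptually:

	        s  = extract_field <v_out2, 0>;
	        s' = extract_field <v_out2, 32>;  s = op <s, s'>;
	        s' = extract_field <v_out2, 64>;  s = op <s, s'>;
	        s' = extract_field <v_out2, 96>;  s = op <s, s'>;

	     Steps 1 and 2 are thus done together, so no further extraction
	     is needed (extract_scalar_result is set to false below).  */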
3204 
3205 	  if (vect_print_dump_info (REPORT_DETAILS))
3206 	    fprintf (vect_dump, "Reduce using scalar code. ");
3207 
3208 	  vec_temp = PHI_RESULT (new_phi);
3209 	  vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3210 	  rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3211 			 bitsize_zero_node);
3212 	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3213 	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3214 	  gimple_assign_set_lhs (epilog_stmt, new_temp);
3215 	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3216 
3217 	  for (bit_offset = element_bitsize;
3218 	       bit_offset < vec_size_in_bits;
3219 	       bit_offset += element_bitsize)
3220 	    {
3221 	      tree bitpos = bitsize_int (bit_offset);
3222 	      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3223 				 bitpos);
3224 
3225 	      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3226 	      new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3227 	      gimple_assign_set_lhs (epilog_stmt, new_name);
3228 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3229 
3230 	      epilog_stmt = gimple_build_assign_with_ops (code,
3231 							  new_scalar_dest,
3232 							  new_name, new_temp);
3233 	      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3234 	      gimple_assign_set_lhs (epilog_stmt, new_temp);
3235 	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3236 	    }
3237 
3238 	  extract_scalar_result = false;
3239 	}
3240     }
3241 
3242   /* 2.4  Extract the final scalar result.  Create:
3243          s_out3 = extract_field <v_out2, bitpos>  */
3244 
3245   if (extract_scalar_result)
3246     {
3247       tree rhs;
3248 
3249       gcc_assert (!nested_in_vect_loop || double_reduc);
3250       if (vect_print_dump_info (REPORT_DETAILS))
3251 	fprintf (vect_dump, "extract scalar result");
3252 
3253       if (BYTES_BIG_ENDIAN)
3254 	bitpos = size_binop (MULT_EXPR,
3255 		       bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3256 		       TYPE_SIZE (scalar_type));
3257       else
3258 	bitpos = bitsize_zero_node;
3259 
3260       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3261       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3262       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3263       gimple_assign_set_lhs (epilog_stmt, new_temp);
3264       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3265     }
3266 
3267 vect_finalize_reduction:
3268 
3269   if (double_reduc)
3270     loop = loop->inner;
3271 
3272   /* 2.5 Adjust the final result by the initial value of the reduction
3273 	 variable. (When such adjustment is not needed, then
3274 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
3275 	 new_temp = loop_exit_def + adjustment_def  */
3276 
3277   if (adjustment_def)
3278     {
3279       if (nested_in_vect_loop)
3280 	{
3281 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3282 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3283 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
3284 	}
3285       else
3286 	{
3287 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3288 	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
3289 	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3290 	}
3291 
3292       epilog_stmt = gimple_build_assign (new_dest, expr);
3293       new_temp = make_ssa_name (new_dest, epilog_stmt);
3294       gimple_assign_set_lhs (epilog_stmt, new_temp);
3295       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3296       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3297     }
3298 
3299 
3300   /* 2.6  Handle the loop-exit phi  */
3301 
3302   /* Replace uses of s_out0 with uses of s_out3:
3303      Find the loop-closed-use at the loop exit of the original scalar result.
3304      (The reduction result is expected to have two immediate uses - one at the
3305      latch block, and one at the loop exit).  */
3306   phis = VEC_alloc (gimple, heap, 10);
3307   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3308     {
3309       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3310 	{
3311 	  exit_phi = USE_STMT (use_p);
3312 	  VEC_quick_push (gimple, phis, exit_phi);
3313 	}
3314     }
3315 
3316   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3317   gcc_assert (!VEC_empty (gimple, phis));
3318 
3319   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3320     {
3321       if (nested_in_vect_loop)
3322 	{
3323 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3324           gimple vect_phi;
3325 
3326 	  /* FORNOW. Currently not supporting the case that an inner-loop
3327 	     reduction is not used in the outer-loop (but only outside the
3328 	     outer-loop), unless it is a double reduction.  */
3329 	  gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3330                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3331 
3332 	  epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3333 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3334 	  set_vinfo_for_stmt (epilog_stmt,
3335 			      new_stmt_vec_info (epilog_stmt, loop_vinfo,
3336 			                         NULL));
3337 	  if (adjustment_def)
3338 	    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3339 		STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3340 
3341           if (!double_reduc
3342               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3343             continue;
3344 
3345           /* Handle double reduction:
3346 
3347              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3348              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3349              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3350              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3351 
3352              At that point the regular reduction (stmt2 and stmt3) is already
3353              vectorized, as well as the exit phi node, stmt4.
3354              Here we vectorize the phi node of double reduction, stmt1, and
3355              update all relevant statements.  */
3356 
3357           /* Go through all the uses of s2 to find the double reduction phi
3358              node, i.e., stmt1 above.  */
3359           orig_name = PHI_RESULT (exit_phi);
3360           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3361             {
3362               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3363               stmt_vec_info new_phi_vinfo;
3364               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3365               basic_block bb = gimple_bb (use_stmt);
3366               gimple use;
3367 
3368               /* Check that USE_STMT is really a double reduction phi node.  */
3369               if (gimple_code (use_stmt) != GIMPLE_PHI
3370                   || gimple_phi_num_args (use_stmt) != 2
3371                   || !use_stmt_vinfo
3372                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3373                       != vect_double_reduction_def
3374                   || bb->loop_father != outer_loop)
3375                 continue;
3376 
3377               /* Create vector phi node for double reduction:
3378                  vs1 = phi <vs0, vs2>
3379                  vs1 was created previously in this function by a call to
3380                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3381                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3382                  vs0 is created here.  */
3383 
3384               /* Create vector phi node.  */
3385               vect_phi = create_phi_node (vec_initial_def, bb);
3386               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3387                                     loop_vec_info_for_loop (outer_loop), NULL);
3388               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3389 
3390               /* Create vs0 - initial def of the double reduction phi.  */
3391               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3392                                              loop_preheader_edge (outer_loop));
3393               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3394                                                         NULL);
3395               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3396                                                 NULL);
3397 
3398               /* Update phi node arguments with vs0 and vs2.  */
3399               add_phi_arg (vect_phi, vect_phi_init,
3400                            loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3401               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3402                            loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3403               if (vect_print_dump_info (REPORT_DETAILS))
3404                 {
3405                   fprintf (vect_dump, "created double reduction phi node: ");
3406                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3407                 }
3408 
3409               vect_phi_res = PHI_RESULT (vect_phi);
3410 
3411               /* Replace the use, i.e., set the correct vs1 in the regular
3412                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3413                  is redundant.  */
3414               use = reduction_phi;
3415               for (j = 0; j < ncopies; j++)
3416                 {
3417                   edge pr_edge = loop_preheader_edge (loop);
3418                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3419                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3420                 }
3421             }
3422 	}
3423 
3424       /* Replace the uses:  */
3425       orig_name = PHI_RESULT (exit_phi);
3426       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3427 	FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3428 	  SET_USE (use_p, new_temp);
3429     }
3430 
3431   VEC_free (gimple, heap, phis);
3432 }
3433 
3434 
3435 /* Function vectorizable_reduction.
3436 
3437    Check if STMT performs a reduction operation that can be vectorized.
3438    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3439    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3440    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3441 
3442    This function also handles reduction idioms (patterns) that have been
3443    recognized in advance during vect_pattern_recog. In this case, STMT may be
3444    of this form:
3445      X = pattern_expr (arg0, arg1, ..., X)
3446    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3447    sequence that had been detected and replaced by the pattern-stmt (STMT).
3448 
3449    In some cases of reduction patterns, the type of the reduction variable X is
3450    different than the type of the other arguments of STMT.
3451    In such cases, the vectype that is used when transforming STMT into a vector
3452    stmt is different than the vectype that is used to determine the
3453    vectorization factor, because it consists of a different number of elements
3454    than the actual number of elements that are being operated upon in parallel.
3455 
3456    For example, consider an accumulation of shorts into an int accumulator.
3457    On some targets it's possible to vectorize this pattern operating on 8
3458    shorts at a time (hence, the vectype for purposes of determining the
3459    vectorization factor should be V8HI); on the other hand, the vectype that
3460    is used to create the vector form is actually V4SI (the type of the result).
3461 
3462    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3463    indicates the actual level of parallelism (V8HI in the example), so that
3464    the right vectorization factor is derived.  This vectype
3465    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3466    be used to create the vectorized stmt. The right vectype for the vectorized
3467    stmt is obtained from the type of the result X:
3468         get_vectype_for_scalar_type (TREE_TYPE (X))
3469 
3470    This means that, contrary to "regular" reductions (or "regular" stmts in
3471    general), the following equation:
3472       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3473    does *NOT* necessarily hold for reduction patterns.  */
3474 
3475 bool
3476 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3477 			gimple *vec_stmt)
3478 {
3479   tree vec_dest;
3480   tree scalar_dest;
3481   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3482   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3483   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3484   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3485   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3486   enum tree_code code, orig_code, epilog_reduc_code;
3487   enum machine_mode vec_mode;
3488   int op_type;
3489   optab optab, reduc_optab;
3490   tree new_temp = NULL_TREE;
3491   tree def;
3492   gimple def_stmt;
3493   enum vect_def_type dt;
3494   gimple new_phi = NULL;
3495   tree scalar_type;
3496   bool is_simple_use;
3497   gimple orig_stmt;
3498   stmt_vec_info orig_stmt_info;
3499   tree expr = NULL_TREE;
3500   int i;
3501   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3502   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3503   int epilog_copies;
3504   stmt_vec_info prev_stmt_info, prev_phi_info;
3505   gimple first_phi = NULL;
3506   bool single_defuse_cycle = false;
3507   tree reduc_def = NULL_TREE;
3508   gimple new_stmt = NULL;
3509   int j;
3510   tree ops[3];
3511   bool nested_cycle = false, found_nested_cycle_def = false;
3512   gimple reduc_def_stmt = NULL;
3513   /* By default, the reduction variable is the last operand in the statement.  */
3514   int reduc_index = 2;
3515   bool double_reduc = false, dummy;
3516   basic_block def_bb;
3517   struct loop * def_stmt_loop, *outer_loop = NULL;
3518   tree def_arg;
3519   gimple def_arg_stmt;
3520 
3521   if (nested_in_vect_loop_p (loop, stmt))
3522     {
3523       outer_loop = loop;
3524       loop = loop->inner;
3525       nested_cycle = true;
3526     }
3527 
3528   gcc_assert (ncopies >= 1);
3529 
3530   /* FORNOW: SLP not supported.  */
3531   if (STMT_SLP_TYPE (stmt_info))
3532     return false;
3533 
3534   /* 1. Is vectorizable reduction?  */
3535   /* Not supportable if the reduction variable is used in the loop.  */
3536   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3537     return false;
3538 
3539   /* Reductions that are not used even in an enclosing outer-loop
3540      are expected to be "live" (used outside the loop).  */
3541   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3542       && !STMT_VINFO_LIVE_P (stmt_info))
3543     return false;
3544 
3545   /* Make sure it was already recognized as a reduction computation.  */
3546   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3547       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3548     return false;
3549 
3550   /* 2. Has this been recognized as a reduction pattern?
3551 
3552      Check if STMT represents a pattern that has been recognized
3553      in earlier analysis stages.  For stmts that represent a pattern,
3554      the STMT_VINFO_RELATED_STMT field records the last stmt in
3555      the original sequence that constitutes the pattern.  */
3556 
3557   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3558   if (orig_stmt)
3559     {
3560       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3561       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3562       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3563       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3564     }
3565 
3566   /* 3. Check the operands of the operation. The first operands are defined
3567         inside the loop body. The last operand is the reduction variable,
3568         which is defined by the loop-header-phi.  */
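  /* E.g., for a typical sum reduction

          s_loop = phi <s_init, s_next>        # loop-header phi
          s_next = a[i] + s_loop               # STMT

     a[i] is defined inside the loop body, while s_loop - the reduction
     variable - is defined by the loop-header phi.  */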
3569 
3570   gcc_assert (is_gimple_assign (stmt));
3571 
3572   /* Flatten RHS */
3573   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3574     {
3575     case GIMPLE_SINGLE_RHS:
3576       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3577       if (op_type == ternary_op)
3578 	{
3579 	  tree rhs = gimple_assign_rhs1 (stmt);
3580 	  ops[0] = TREE_OPERAND (rhs, 0);
3581 	  ops[1] = TREE_OPERAND (rhs, 1);
3582 	  ops[2] = TREE_OPERAND (rhs, 2);
3583 	  code = TREE_CODE (rhs);
3584 	}
3585       else
3586 	return false;
3587       break;
3588 
3589     case GIMPLE_BINARY_RHS:
3590       code = gimple_assign_rhs_code (stmt);
3591       op_type = TREE_CODE_LENGTH (code);
3592       gcc_assert (op_type == binary_op);
3593       ops[0] = gimple_assign_rhs1 (stmt);
3594       ops[1] = gimple_assign_rhs2 (stmt);
3595       break;
3596 
3597     case GIMPLE_UNARY_RHS:
3598       return false;
3599 
3600     default:
3601       gcc_unreachable ();
3602     }
3603 
3604   scalar_dest = gimple_assign_lhs (stmt);
3605   scalar_type = TREE_TYPE (scalar_dest);
3606   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3607       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3608     return false;
3609 
3610   /* All uses but the last are expected to be defined in the loop.
3611      The last use is the reduction variable.  In case of a nested cycle this
3612      assumption does not hold: we use reduc_index to record the index of the
3613      reduction variable.  */
3614   for (i = 0; i < op_type-1; i++)
3615     {
3616       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3617       if (i == 0 && code == COND_EXPR)
3618         continue;
3619 
3620       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3621 					  &def, &dt);
3622       gcc_assert (is_simple_use);
3623       if (dt != vect_internal_def
3624 	  && dt != vect_external_def
3625 	  && dt != vect_constant_def
3626 	  && dt != vect_induction_def
3627           && !(dt == vect_nested_cycle && nested_cycle))
3628 	return false;
3629 
3630       if (dt == vect_nested_cycle)
3631         {
3632           found_nested_cycle_def = true;
3633           reduc_def_stmt = def_stmt;
3634           reduc_index = i;
3635         }
3636     }
3637 
3638   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3639                                       &def, &dt);
3640   gcc_assert (is_simple_use);
3641   gcc_assert (dt == vect_reduction_def
3642               || dt == vect_nested_cycle
3643               || ((dt == vect_internal_def || dt == vect_external_def
3644                    || dt == vect_constant_def || dt == vect_induction_def)
3645                    && nested_cycle && found_nested_cycle_def));
3646   if (!found_nested_cycle_def)
3647     reduc_def_stmt = def_stmt;
3648 
3649   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3650   if (orig_stmt)
3651     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3652                                                        reduc_def_stmt,
3653                                                        !nested_cycle,
3654                                                        &dummy));
3655   else
3656     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3657                                                   !nested_cycle, &dummy));
3658 
3659   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3660     return false;
3661 
3662   vec_mode = TYPE_MODE (vectype);
3663 
3664   if (code == COND_EXPR)
3665     {
3666       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3667         {
3668           if (vect_print_dump_info (REPORT_DETAILS))
3669             fprintf (vect_dump, "unsupported condition in reduction");
3670 
3671           return false;
3672         }
3673     }
3674   else
3675     {
3676       /* 4. Supportable by target?  */
3677 
3678       /* 4.1. check support for the operation in the loop  */
3679       optab = optab_for_tree_code (code, vectype, optab_default);
3680       if (!optab)
3681         {
3682           if (vect_print_dump_info (REPORT_DETAILS))
3683             fprintf (vect_dump, "no optab.");
3684 
3685           return false;
3686         }
3687 
3688       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3689         {
3690           if (vect_print_dump_info (REPORT_DETAILS))
3691             fprintf (vect_dump, "op not supported by target.");
3692 
3693           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3694               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3695 	          < vect_min_worthwhile_factor (code))
3696             return false;
3697 
3698           if (vect_print_dump_info (REPORT_DETAILS))
3699   	    fprintf (vect_dump, "proceeding using word mode.");
3700         }
3701 
3702       /* Worthwhile without SIMD support?  */
3703       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3704           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3705    	     < vect_min_worthwhile_factor (code))
3706         {
3707           if (vect_print_dump_info (REPORT_DETAILS))
3708 	    fprintf (vect_dump, "not worthwhile without SIMD support.");
3709 
3710           return false;
3711         }
3712     }
3713 
3714   /* 4.2. Check support for the epilog operation.
3715 
3716           If STMT represents a reduction pattern, then the type of the
3717           reduction variable may be different than the type of the rest
3718           of the arguments.  For example, consider the case of accumulation
3719           of shorts into an int accumulator.  The original code:
3720                         S1: int_a = (int) short_a;
3721           orig_stmt->   S2: int_acc = plus <int_a, int_acc>;
3722 
3723           was replaced with:
3724                         STMT: int_acc = widen_sum <short_a, int_acc>
3725 
3726           This means that:
3727           1. The tree-code that is used to create the vector operation in the
3728              epilog code (that reduces the partial results) is not the
3729              tree-code of STMT, but is rather the tree-code of the original
3730              stmt from the pattern that STMT is replacing.  I.e., in the example
3731              above we want to use 'widen_sum' in the loop, but 'plus' in the
3732              epilog.
3733           2. The type (mode) we use to check available target support
3734              for the vector operation to be created in the *epilog*, is
3735              determined by the type of the reduction variable (in the example
3736              above we'd check this: plus_optab[vect_int_mode]).
3737              However the type (mode) we use to check available target support
3738              for the vector operation to be created *inside the loop*, is
3739              determined by the type of the other arguments to STMT (in the
3740              example we'd check this: widen_sum_optab[vect_short_mode]).
3741 
3742           This is contrary to "regular" reductions, in which the types of all
3743           the arguments are the same as the type of the reduction variable.
3744           For "regular" reductions we can therefore use the same vector type
3745           (and also the same tree-code) when generating the epilog code and
3746           when generating the code inside the loop.  */
3747 
3748   if (orig_stmt)
3749     {
3750       /* This is a reduction pattern: get the vectype from the type of the
3751          reduction variable, and get the tree-code from orig_stmt.  */
3752       orig_code = gimple_assign_rhs_code (orig_stmt);
3753       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3754       if (!vectype)
3755 	{
3756           if (vect_print_dump_info (REPORT_DETAILS))
3757             {
3758               fprintf (vect_dump, "unsupported data-type ");
3759               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3760             }
3761           return false;
3762         }
3763 
3764       vec_mode = TYPE_MODE (vectype);
3765     }
3766   else
3767     {
3768       /* Regular reduction: the same vectype and tree-code as used for the
3769          vector code inside the loop can also be used for the epilog code.  */
3770       orig_code = code;
3771     }
3772 
3773   if (nested_cycle)
3774     {
3775       def_bb = gimple_bb (reduc_def_stmt);
3776       def_stmt_loop = def_bb->loop_father;
3777       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3778                                        loop_preheader_edge (def_stmt_loop));
3779       if (TREE_CODE (def_arg) == SSA_NAME
3780           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3781           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3782           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3783           && vinfo_for_stmt (def_arg_stmt)
3784           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3785               == vect_double_reduction_def)
3786         double_reduc = true;
3787     }
3788 
3789   epilog_reduc_code = ERROR_MARK;
3790   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3791     {
3792       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3793                                          optab_default);
3794       if (!reduc_optab)
3795         {
3796           if (vect_print_dump_info (REPORT_DETAILS))
3797             fprintf (vect_dump, "no optab for reduction.");
3798 
3799           epilog_reduc_code = ERROR_MARK;
3800         }
3801 
3802       if (reduc_optab
3803           && optab_handler (reduc_optab, vec_mode)->insn_code
3804               == CODE_FOR_nothing)
3805         {
3806           if (vect_print_dump_info (REPORT_DETAILS))
3807             fprintf (vect_dump, "reduc op not supported by target.");
3808 
3809           epilog_reduc_code = ERROR_MARK;
3810         }
3811     }
3812   else
3813     {
3814       if (!nested_cycle || double_reduc)
3815         {
3816           if (vect_print_dump_info (REPORT_DETAILS))
3817             fprintf (vect_dump, "no reduc code for scalar code.");
3818 
3819           return false;
3820         }
3821     }
3822 
3823   if (double_reduc && ncopies > 1)
3824     {
3825       if (vect_print_dump_info (REPORT_DETAILS))
3826         fprintf (vect_dump, "multiple types in double reduction");
3827 
3828       return false;
3829     }
3830 
3831   if (!vec_stmt) /* transformation not required.  */
3832     {
3833       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3834       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3835         return false;
3836       return true;
3837     }
3838 
3839   /** Transform.  **/
3840 
3841   if (vect_print_dump_info (REPORT_DETAILS))
3842     fprintf (vect_dump, "transform reduction.");
3843 
3844   /* FORNOW: Multiple types are not supported for condition.  */
3845   if (code == COND_EXPR)
3846     gcc_assert (ncopies == 1);
3847 
3848   /* Create the destination vector  */
3849   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3850 
3851   /* In case the vectorization factor (VF) is bigger than the number
3852      of elements that we can fit in a vectype (nunits), we have to generate
3853      more than one vector stmt - i.e., we need to "unroll" the
3854      vector stmt by a factor VF/nunits.  For more details see documentation
3855      in vectorizable_operation.  */
3856 
3857   /* If the reduction is used in an outer loop we need to generate
3858      VF intermediate results, like so (e.g. for ncopies=2):
3859 	r0 = phi (init, r0)
3860 	r1 = phi (init, r1)
3861 	r0 = x0 + r0;
3862         r1 = x1 + r1;
3863     (i.e. we generate VF results in 2 registers).
3864     In this case we have a separate def-use cycle for each copy, and therefore
3865     for each copy we get the vector def for the reduction variable from the
3866     respective phi node created for this copy.
3867 
3868     Otherwise (the reduction is unused in the loop nest), we can combine
3869     together intermediate results, like so (e.g. for ncopies=2):
3870 	r = phi (init, r)
3871 	r = x0 + r;
3872 	r = x1 + r;
3873    (i.e. we generate VF/2 results in a single register).
3874    In this case for each copy we get the vector def for the reduction variable
3875    from the vectorized reduction operation generated in the previous iteration.
3876   */
3877 
3878   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3879     {
3880       single_defuse_cycle = true;
3881       epilog_copies = 1;
3882     }
3883   else
3884     epilog_copies = ncopies;
3885 
3886   prev_stmt_info = NULL;
3887   prev_phi_info = NULL;
3888   for (j = 0; j < ncopies; j++)
3889     {
3890       if (j == 0 || !single_defuse_cycle)
3891 	{
3892 	  /* Create the reduction-phi that defines the reduction-operand.  */
3893 	  new_phi = create_phi_node (vec_dest, loop->header);
3894 	  set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3895 	                                                  NULL));
3896           /* Get the vector def for the reduction variable from the phi
3897              node.  */
3898           reduc_def = PHI_RESULT (new_phi);
3899 	}
3900 
3901       if (code == COND_EXPR)
3902         {
3903           first_phi = new_phi;
3904           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3905           /* Multiple types are not supported for condition.  */
3906           break;
3907         }
3908 
3909       /* Handle uses.  */
3910       if (j == 0)
3911         {
3912 	  loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3913                                                         stmt, NULL);
3914           if (op_type == ternary_op)
3915             {
3916               if (reduc_index == 0)
3917  	        loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3918                                                               NULL);
3919               else
3920                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3921                                                               NULL);
3922             }
3923 
3924           /* Remember the first reduction-phi; it is passed to
3925              vect_create_epilog_for_reduction below.  */
3926 	  first_phi = new_phi;
3927         }
3928       else
3929         {
3930           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3931           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3932           if (op_type == ternary_op)
3933             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3934 
3935 	  if (single_defuse_cycle)
3936 	    reduc_def = gimple_assign_lhs (new_stmt);
3937 	  else
3938 	    reduc_def = PHI_RESULT (new_phi);
3939 
3940 	  STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3941         }
3942 
3943       /* Arguments are ready. Create the new vector stmt.  */
3944       if (op_type == binary_op)
3945         {
3946           if (reduc_index == 0)
3947             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3948           else
3949             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3950         }
3951       else
3952         {
3953           if (reduc_index == 0)
3954             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3955                            loop_vec_def1);
3956           else
3957             {
3958               if (reduc_index == 1)
3959                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3960                                loop_vec_def1);
3961               else
3962                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3963 	     	               reduc_def);
3964             }
3965         }
3966 
3967       new_stmt = gimple_build_assign (vec_dest, expr);
3968       new_temp = make_ssa_name (vec_dest, new_stmt);
3969       gimple_assign_set_lhs (new_stmt, new_temp);
3970       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3971 
3972       if (j == 0)
3973 	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3974       else
3975 	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3976 
3977       prev_stmt_info = vinfo_for_stmt (new_stmt);
3978       prev_phi_info = vinfo_for_stmt (new_phi);
3979     }
3980 
3981   /* Finalize the reduction-phi (set its arguments) and create the
3982      epilog reduction code.  */
3983   if (!single_defuse_cycle || code == COND_EXPR)
3984     new_temp = gimple_assign_lhs (*vec_stmt);
3985 
3986   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3987 				    epilog_reduc_code, first_phi, reduc_index,
3988                                     double_reduc);
3989   return true;
3990 }
3991 
3992 /* Function vect_min_worthwhile_factor.
3993 
3994    For a loop where we could vectorize the operation indicated by CODE,
3995    return the minimum vectorization factor that makes it worthwhile
3996    to use generic vectors.  */
3997 int
3998 vect_min_worthwhile_factor (enum tree_code code)
3999 {
4000   switch (code)
4001     {
4002     case PLUS_EXPR:
4003     case MINUS_EXPR:
4004     case NEGATE_EXPR:
4005       return 4;
4006 
4007     case BIT_AND_EXPR:
4008     case BIT_IOR_EXPR:
4009     case BIT_XOR_EXPR:
4010     case BIT_NOT_EXPR:
4011       return 2;
4012 
4013     default:
4014       return INT_MAX;
4015     }
4016 }
4017 
4018 
4019 /* Function vectorizable_induction
4020 
4021    Check if PHI performs an induction computation that can be vectorized.
4022    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4023    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4024    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
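/* For example (a conceptual sketch, not the exact IL that gets built):
   with VF == 4, the scalar induction

        i_1 = PHI <0, i_2>
        i_2 = i_1 + 1

   is vectorized as

        vi_1 = PHI <{0,1,2,3}, vi_2>
        vi_2 = vi_1 + {4,4,4,4}

   The initial vector and the vector step are built by
   get_initial_def_for_induction.  */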
4025 
4026 bool
4027 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4028 			gimple *vec_stmt)
4029 {
4030   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4031   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4032   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4033   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4034   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4035   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4036   tree vec_def;
4037 
4038   gcc_assert (ncopies >= 1);
4039   /* FORNOW. This restriction should be relaxed.  */
4040   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4041     {
4042       if (vect_print_dump_info (REPORT_DETAILS))
4043         fprintf (vect_dump, "multiple types in nested loop.");
4044       return false;
4045     }
4046 
4047   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4048     return false;
4049 
4050   /* FORNOW: SLP not supported.  */
4051   if (STMT_SLP_TYPE (stmt_info))
4052     return false;
4053 
4054   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4055 
4056   if (gimple_code (phi) != GIMPLE_PHI)
4057     return false;
4058 
4059   if (!vec_stmt) /* transformation not required.  */
4060     {
4061       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4062       if (vect_print_dump_info (REPORT_DETAILS))
4063         fprintf (vect_dump, "=== vectorizable_induction ===");
4064       vect_model_induction_cost (stmt_info, ncopies);
4065       return true;
4066     }
4067 
4068   /** Transform.  **/
4069 
4070   if (vect_print_dump_info (REPORT_DETAILS))
4071     fprintf (vect_dump, "transform induction phi.");
4072 
4073   vec_def = get_initial_def_for_induction (phi);
4074   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4075   return true;
4076 }
4077 
4078 /* Function vectorizable_live_operation.
4079 
4080    STMT computes a value that is used outside the loop. Check if
4081    it can be supported.  */
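/* For instance (a sketch of the only case currently supported):

        for (i = 0; i < n; i++)
          s = x + y;                  # x, y defined before the loop
        use (s);                      # s used after the loop

   Since all operands of STMT are loop-invariant, the scalar stmt can stay
   in place unvectorized, and the last scalar value it computes is used.  */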
4082 
4083 bool
4084 vectorizable_live_operation (gimple stmt,
4085 			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4086 			     gimple *vec_stmt ATTRIBUTE_UNUSED)
4087 {
4088   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4089   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4090   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4091   int i;
4092   int op_type;
4093   tree op;
4094   tree def;
4095   gimple def_stmt;
4096   enum vect_def_type dt;
4097   enum tree_code code;
4098   enum gimple_rhs_class rhs_class;
4099 
4100   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4101 
4102   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4103     return false;
4104 
4105   if (!is_gimple_assign (stmt))
4106     return false;
4107 
4108   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4109     return false;
4110 
4111   /* FORNOW. CHECKME. */
4112   if (nested_in_vect_loop_p (loop, stmt))
4113     return false;
4114 
4115   code = gimple_assign_rhs_code (stmt);
4116   op_type = TREE_CODE_LENGTH (code);
4117   rhs_class = get_gimple_rhs_class (code);
4118   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4119   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4120 
4121   /* FORNOW: support only if all uses are invariant. This means
4122      that the scalar operations can remain in place, unvectorized.
4123      The original last scalar value that they compute will be used.  */
4124 
4125   for (i = 0; i < op_type; i++)
4126     {
4127       if (rhs_class == GIMPLE_SINGLE_RHS)
4128 	op = TREE_OPERAND (gimple_op (stmt, 1), i);
4129       else
4130 	op = gimple_op (stmt, i + 1);
4131       if (op
4132           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4133         {
4134           if (vect_print_dump_info (REPORT_DETAILS))
4135             fprintf (vect_dump, "use not simple.");
4136           return false;
4137         }
4138 
4139       if (dt != vect_external_def && dt != vect_constant_def)
4140         return false;
4141     }
4142 
4143   /* No transformation is required for the cases we currently support.  */
4144   return true;
4145 }
4146 
4147 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4148 
4149 static void
4150 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4151 {
4152   ssa_op_iter op_iter;
4153   imm_use_iterator imm_iter;
4154   def_operand_p def_p;
4155   gimple ustmt;
4156 
4157   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4158     {
4159       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4160 	{
4161 	  basic_block bb;
4162 
4163 	  if (!is_gimple_debug (ustmt))
4164 	    continue;
4165 
4166 	  bb = gimple_bb (ustmt);
4167 
4168 	  if (!flow_bb_inside_loop_p (loop, bb))
4169 	    {
4170 	      if (gimple_debug_bind_p (ustmt))
4171 		{
4172 		  if (vect_print_dump_info (REPORT_DETAILS))
4173 		    fprintf (vect_dump, "killing debug use");
4174 
4175 		  gimple_debug_bind_reset_value (ustmt);
4176 		  update_stmt (ustmt);
4177 		}
4178 	      else
4179 		gcc_unreachable ();
4180 	    }
4181 	}
4182     }
4183 }
4184 
4185 /* Function vect_transform_loop.
4186 
4187    The analysis phase has determined that the loop is vectorizable.
4188    Vectorize the loop - create vectorized stmts to replace the scalar
4189    stmts in the loop, and update the loop exit condition.  */
4190 
4191 void
4192 vect_transform_loop (loop_vec_info loop_vinfo)
4193 {
4194   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4195   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4196   int nbbs = loop->num_nodes;
4197   gimple_stmt_iterator si;
4198   int i;
4199   tree ratio = NULL;
4200   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4201   bool strided_store;
4202   bool slp_scheduled = false;
4203   unsigned int nunits;
4204   tree cond_expr = NULL_TREE;
4205   gimple_seq cond_expr_stmt_list = NULL;
4206   bool do_peeling_for_loop_bound;
4207 
4208   if (vect_print_dump_info (REPORT_DETAILS))
4209     fprintf (vect_dump, "=== vect_transform_loop ===");
4210 
4211   /* Peel the loop if there are data refs with unknown alignment.
4212      Only one store with unknown alignment is allowed.  */
4213 
4214   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4215     vect_do_peeling_for_alignment (loop_vinfo);
4216 
4217   do_peeling_for_loop_bound
4218     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4219        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4220 	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
4221        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
4222 
4223   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4224       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4225     vect_loop_versioning (loop_vinfo,
4226 			  !do_peeling_for_loop_bound,
4227 			  &cond_expr, &cond_expr_stmt_list);
4228 
4229   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4230      compile time constant), or it is a constant that is not divisible by
4231      the vectorization factor, then an epilog loop needs to be created.
4232      We therefore duplicate the loop: the original loop will be vectorized,
4233      and will compute the first (n/VF) iterations. The second copy of the loop
4234      will remain scalar and will compute the remaining (n%VF) iterations.
4235      (VF is the vectorization factor).  */
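  /* E.g., with VF == 4 and a symbolic niters 'n', conceptually:

          for (i = 0; i < n; i++)           # original loop
            x[i] = y[i] + z[i];

     becomes

          for (i = 0; i < n/4; i++)         # vectorized loop
            vx[i] = vy[i] + vz[i];
          for (i = (n/4)*4; i < n; i++)     # scalar epilog, n%4 iterations
            x[i] = y[i] + z[i];  */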
4236 
4237   if (do_peeling_for_loop_bound)
4238     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4239 				    cond_expr, cond_expr_stmt_list);
4240   else
4241     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4242 		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4243 
4244   /* 1) Make sure the loop header has exactly two entries
4245      2) Make sure we have a preheader basic block.  */
4246 
4247   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4248 
4249   split_edge (loop_preheader_edge (loop));
4250 
4251   /* FORNOW: the vectorizer supports only loops whose body consists
4252      of one basic block (header + empty latch).  When the vectorizer
4253      supports more involved loop forms, the order in which the BBs are
4254      traversed will need to be reconsidered.  */
4255 
4256   for (i = 0; i < nbbs; i++)
4257     {
4258       basic_block bb = bbs[i];
4259       stmt_vec_info stmt_info;
4260       gimple phi;
4261 
4262       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4263         {
4264 	  phi = gsi_stmt (si);
4265 	  if (vect_print_dump_info (REPORT_DETAILS))
4266 	    {
4267 	      fprintf (vect_dump, "------>vectorizing phi: ");
4268 	      print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4269 	    }
4270 	  stmt_info = vinfo_for_stmt (phi);
4271 	  if (!stmt_info)
4272 	    continue;
4273 
4274 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4275 	    vect_loop_kill_debug_uses (loop, phi);
4276 
4277 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
4278 	      && !STMT_VINFO_LIVE_P (stmt_info))
4279 	    continue;
4280 
4281 	  if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4282 	        != (unsigned HOST_WIDE_INT) vectorization_factor)
4283 	      && vect_print_dump_info (REPORT_DETAILS))
4284 	    fprintf (vect_dump, "multiple-types.");
4285 
4286 	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4287 	    {
4288 	      if (vect_print_dump_info (REPORT_DETAILS))
4289 		fprintf (vect_dump, "transform phi.");
4290 	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4291 	    }
4292 	}
4293 
4294       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4295 	{
4296 	  gimple stmt = gsi_stmt (si);
4297 	  bool is_store;
4298 
4299 	  if (vect_print_dump_info (REPORT_DETAILS))
4300 	    {
4301 	      fprintf (vect_dump, "------>vectorizing statement: ");
4302 	      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4303 	    }
4304 
4305 	  stmt_info = vinfo_for_stmt (stmt);
4306 
4307 	  /* vector stmts created in the outer-loop during vectorization of
4308 	     stmts in an inner-loop may not have a stmt_info, and do not
4309 	     need to be vectorized.  */
4310 	  if (!stmt_info)
4311 	    {
4312 	      gsi_next (&si);
4313 	      continue;
4314 	    }
4315 
4316 	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4317 	    vect_loop_kill_debug_uses (loop, stmt);
4318 
4319 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
4320 	      && !STMT_VINFO_LIVE_P (stmt_info))
4321 	    {
4322 	      gsi_next (&si);
4323 	      continue;
4324 	    }
4325 
4326 	  gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4327 	  nunits =
4328 	    (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4329 	  if (!STMT_SLP_TYPE (stmt_info)
4330 	      && nunits != (unsigned int) vectorization_factor
4331               && vect_print_dump_info (REPORT_DETAILS))
4332 	    /* For SLP, VF is set according to the unrolling factor, and not
4333 	       to the vector size; hence for SLP this print is not valid.  */
4334             fprintf (vect_dump, "multiple-types.");
4335 
4336 	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
4337 	     reached.  */
4338 	  if (STMT_SLP_TYPE (stmt_info))
4339 	    {
4340 	      if (!slp_scheduled)
4341 		{
4342 		  slp_scheduled = true;
4343 
4344 		  if (vect_print_dump_info (REPORT_DETAILS))
4345 		    fprintf (vect_dump, "=== scheduling SLP instances ===");
4346 
4347 		  vect_schedule_slp (loop_vinfo, NULL);
4348 		}
4349 
4350 	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4351 	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4352 		{
4353 		  gsi_next (&si);
4354 		  continue;
4355 		}
4356 	    }
4357 
4358 	  /* -------- vectorize statement ------------ */
4359 	  if (vect_print_dump_info (REPORT_DETAILS))
4360 	    fprintf (vect_dump, "transform statement.");
4361 
4362 	  strided_store = false;
4363 	  is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4364           if (is_store)
4365             {
4366 	      if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4367 		{
4368 		  /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4369 		     interleaving chain was completed - free all the stores in
4370 		     the chain.  */
4371 		  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4372 		  gsi_remove (&si, true);
4373 		  continue;
4374 		}
4375 	      else
4376 		{
4377 		  /* Free the attached stmt_vec_info and remove the stmt.  */
4378 		  free_stmt_vec_info (stmt);
4379 		  gsi_remove (&si, true);
4380 		  continue;
4381 		}
4382 	    }
4383 	  gsi_next (&si);
4384 	}		        /* stmts in BB */
4385     }				/* BBs in loop */
4386 
4387   slpeel_make_loop_iterate_ntimes (loop, ratio);
4388 
4389   /* The memory tags and pointers in vectorized statements need to
4390      have their SSA forms updated.  FIXME, why can't this be delayed
4391      until all the loops have been transformed?  */
4392   update_ssa (TODO_update_ssa);
4393 
4394   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4395     fprintf (vect_dump, "LOOP VECTORIZED.");
4396   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4397     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4398 }
4399