/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the basic block location if found and UNKNOWN_LOCATION if not.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
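/* Illustrative example (editorial, not from the original sources): in the
   chain started by the store to a[0] in
     a[0] = x;  a[1] = y;  a[2] = z;
   the stmt storing a[2] is at place 2, while a stmt from an unrelated
   chain yields -1.  */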

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is OK.  */
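/* Illustrative example (editorial assumption): for the two-stmt group
     a0 = b0 + c0;
     a1 = c1 + b1;
   the defs of stmt 1 mismatch those recorded for stmt 0, but PLUS_EXPR is
   commutative, so the operands of stmt 1 are re-tried in swapped order
   and, on success, physically swapped via swap_ssa_operands before
   returning 0.  */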

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	{
	  first_op_cond = true;
	  commutative = true;
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  if (i == 0 || i == 1)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
				  swapped ? !i : i);
	  else
	    oprnd = gimple_op (stmt, first_op_idx + i - 1);
	}
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
			       &def, &dt)
	  || (!def_stmt && dt != vect_constant_def))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't find def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
	      || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
		  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
	    {
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
              case GIMPLE_PHI:
                def = gimple_phi_result (def_stmt);
                break;

              case GIMPLE_ASSIGN:
                def = gimple_assign_lhs (def_stmt);
                break;

              default:
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "unsupported defining stmt:\n");
                return -1;
            }
        }

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
		&& !((oprnd_info->first_dt == vect_external_def
		      || oprnd_info->first_dt == vect_constant_def)
		     && (dt == vect_external_def
			 || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
				       TREE_TYPE (oprnd))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
        case vect_reduction_def:
	  break;

	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
			     &TREE_OPERAND (cond, 1));
	  TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are of supported types of operation.  Return true if
   so; otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this indicates
   the comparison could not be carried out or the stmts will never be
   vectorized by SLP.  */
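/* For instance (editorial example): the group
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   is not isomorphic because PLUS_EXPR and MINUS_EXPR differ, so *MATCHES
   would end up as { true, false }.  */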

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
		       vec<gimple> stmts, unsigned int group_size,
		       unsigned nops, unsigned int *max_nunits,
		       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

       if (is_gimple_assign (stmt)
	   && gimple_assign_rhs_code (stmt) == COND_EXPR
           && (cond = gimple_assign_rhs1 (stmt))
           && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: condition is not "
			       "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  if (gimple_call_internal_p (call_stmt)
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      gimple first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      unsigned unrolling_factor
		= least_common_multiple
		    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
		 and no gap between the groups when we need to load
		 multiple groups at once.
		 ???  We should enhance this to only disallow gaps
		 inside vectors.  */
              if ((unrolling_factor > 1
		   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
			&& GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
		       /* If the group is split up then GROUP_GAP
			  isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
		       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
		  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
		      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: grouped "
				       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Fatal mismatch.  */
		  matches[0] = false;
                  return false;
                }

              /* Check that the size of the interleaved load group is not
                 greater than the SLP group size.  */
	      unsigned ncopies
		= vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
		       - GROUP_GAP (vinfo_for_stmt (stmt)))
		      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: the number "
				       "of interleaved loads is greater than "
				       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Fatal mismatch.  */
		  matches[0] = false;
                  return false;
                }

	      old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
		      /* Mismatch.  */
		      continue;
                    }
                }
              else
                prev_first_load = first_load;

	      /* In some cases a group of loads is just the same load
		 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: unsupported "
					   "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
		      /* Fatal mismatch.  */
		      matches[0] = false;
                      return false;
                    }
                }
           }
        } /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not a grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      /* FORNOW: Non-grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not a memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && rhs_code != COND_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
		  /* Mismatch.  */
		  continue;
		}
            }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return TRUE.  */
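/* As an illustration (editorial, not from the original sources): for the
   grouped stores
     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];
   the root node holds the two stores and its children hold the grouped
   loads of b[] and c[], at which the recursion terminates.  */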

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
			      max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
					   stmt, (i == 0), &oprnds_info))
	{
	case 0:
	  break;
	case -1:
	  matches[0] = false;
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	case 1:
	  matches[i] = false;
	  break;
	}
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
	{
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
	{
	  vect_free_oprnd_info (oprnds_info);
	  return false;
	}

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
			       group_size, max_nunits, loads,
			       vectorization_factor, matches,
			       npermutes, &this_tree_size, max_tree_size))
	{
	  oprnd_info->def_stmts = vNULL;
	  SLP_TREE_CHILDREN (*node).quick_push (child);
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ???  For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  unsigned int j;
	  slp_tree grandchild;

	  /* Roll back.  */
	  *max_nunits = old_max_nunits;
	  loads->truncate (old_nloads);
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
	    vect_free_slp_tree (grandchild);
	  SLP_TREE_CHILDREN (child).truncate (0);

	  /* Swap mismatched definition stmts.  */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      {
		gimple tem = oprnds_info[0]->def_stmts[j];
		oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
		oprnds_info[1]->def_stmts[j] = tem;
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again ... */
	  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
				   group_size, max_nunits, loads,
				   vectorization_factor,
				   matches, npermutes, &this_tree_size,
				   max_tree_size))
	    {
	      oprnd_info->def_stmts = vNULL;
	      SLP_TREE_CHILDREN (*node).quick_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */
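/* E.g. (editorial example): with PERMUTATION = {2, 0, 1} the stmt at
   index 0 is scattered to slot 2, index 1 to slot 0 and index 2 to
   slot 1, following tmp_stmts[permutation[i]] = stmt below.  */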

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */
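  /* Editorial example: if both load nodes of a reduction read {b[1], b[0]}
     with permutation {1, 0}, the permutations agree, so the whole instance
     is rearranged into load order and no runtime permutation is needed.  */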

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
	return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
	{
	  if (!load->load_permutation.exists ())
	    return false;
	  FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	    if (lidx != node->load_permutation[j])
	      return false;
	}

      /* Check that the loads in the first sequence are different and there
	 are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
	{
	  if (bitmap_bit_p (load_index, lidx))
	    {
	      sbitmap_free (load_index);
	      return false;
	    }
	  bitmap_set_bit (load_index, lidx);
	}
      for (i = 0; i < group_size; i++)
	if (!bitmap_bit_p (load_index, i))
	  {
	    sbitmap_free (load_index);
	    return false;
	  }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
	 statements in the nodes is not important unless they are memory
	 accesses, we can rearrange the statements in all the nodes
	 according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
				node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
	 form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
		return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
	 ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  first_load = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
	    {
	      dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
	      if (vect_supportable_dr_alignment (dr, false)
		  == dr_unaligned_unsupported)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				       vect_location,
				       "unsupported unaligned load ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	    }
	}

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE, and where each sequence of the same dr is of GROUP_SIZE
     length as well (unless it's a reduction).  */
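  /* Concretely (editorial example): with GROUP_SIZE == 2 the two load
     nodes may read {b[0], b[0]} and {b[1], b[1]}, i.e. permutations
     {0, 0} and {1, 1}: each node repeats a single dr GROUP_SIZE times
     and the leading indices 0 and 1 cover the whole group.  */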
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
	{
	  sbitmap_free (load_index);
	  return false;
	}
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
	if (k != lidx)
	  {
	    sbitmap_free (load_index);
	    return false;
	  }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
	sbitmap_free (load_index);
	return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load
	      (node, vNULL, NULL,
	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}


/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
			 slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
			     instance, child, prologue_cost_vec,
			     ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost, false,
			       vect_uninitialized_def,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  int i;
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  vect_model_load_cost (stmt_info, ncopies_for_cost, false,
				node, prologue_cost_vec, body_cost_vec);
	  /* If the load is permuted record the cost for the permutation.
	     ???  Loads from multiple chains are let through here only
	     for a single special case involving complex numbers where
	     in the end no permutation is necessary.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
	    if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
		 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
		&& vect_get_place_in_interleaving_chain
		     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
	      {
		record_stmt_cost (body_cost_vec, group_size, vec_perm,
				  stmt_info, 0, vect_body);
		break;
	      }
	}
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
		      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
	continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
			      &def_stmt, &def, &dt)
	  && (dt == vect_constant_def || dt == vect_external_def))
	record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
			  stmt_info, 0, vect_prologue);
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
		       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (the number of vectors is LCM (NUNITS, GROUP_SIZE) / NUNITS,
     which is 1 when GROUP_SIZE divides NUNITS).  */
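  /* E.g. (editorial): NUNITS == 4 and GROUP_SIZE == 6 give
     LCM (4, 6) / 4 == 12 / 4 == 3 copies for costing.  */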
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
			   instance, SLP_INSTANCE_TREE (instance),
			   &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
		: BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */
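/* A typical starting point (editorial illustration) is a store group such as
     a[0] = ...;  a[1] = ...;
   whose stmts become the scalar stmts of the instance's root node.  */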

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
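  /* E.g. (editorial): NUNITS == 4 and GROUP_SIZE == 6 give
     LCM (4, 6) / 6 == 2, i.e. two copies of the group are needed before
     whole vectors are formed.  */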
1600   unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1601   if (unrolling_factor != 1 && !loop_vinfo)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 			 "Build SLP failed: unrolling required in basic"
1606 			 " block SLP\n");
1607 
1608       return false;
1609     }
1610 
1611   /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1612   scalar_stmts.create (group_size);
1613   next = stmt;
1614   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1615     {
1616       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1617       while (next)
1618         {
1619 	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1620 	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1621 	    scalar_stmts.safe_push (
1622 		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1623 	  else
1624             scalar_stmts.safe_push (next);
1625           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1626         }
1627     }
1628   else
1629     {
1630       /* Collect reduction statements.  */
1631       vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1632       for (i = 0; reductions.iterate (i, &next); i++)
1633 	scalar_stmts.safe_push (next);
1634     }
1635 
1636   node = vect_create_new_slp_node (scalar_stmts);
1637 
1638   loads.create (group_size);
1639 
1640   /* Build the tree for the SLP instance.  */
1641   bool *matches = XALLOCAVEC (bool, group_size);
1642   unsigned npermutes = 0;
1643   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1644 			   &max_nunits, &loads,
1645 			   vectorization_factor, matches, &npermutes, NULL,
1646 			   max_tree_size))
1647     {
1648       /* Calculate the unrolling factor based on the smallest type.  */
1649       if (max_nunits > nunits)
1650         unrolling_factor = least_common_multiple (max_nunits, group_size)
1651                            / group_size;
1652 
1653       if (unrolling_factor != 1 && !loop_vinfo)
1654         {
1655           if (dump_enabled_p ())
1656             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 			     "Build SLP failed: unrolling required in basic"
1658 			     " block SLP\n");
1659 	  vect_free_slp_tree (node);
1660 	  loads.release ();
1661           return false;
1662         }
1663 
1664       /* Create a new SLP instance.  */
1665       new_instance = XNEW (struct _slp_instance);
1666       SLP_INSTANCE_TREE (new_instance) = node;
1667       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1668       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1669       SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1670       SLP_INSTANCE_LOADS (new_instance) = loads;
1671       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1672 
1673       /* Compute the load permutation.  */
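      /* An illustrative case (not tied to this function's callers): if a
	 load node reads the interleaving chain a[0], a[1], a[2] in the
	 scalar order a[1], a[0], a[2], the places computed below are
	 {1, 0, 2}; because place 1 != index 0 the node counts as permuted
	 and {1, 0, 2} is recorded as its SLP_TREE_LOAD_PERMUTATION.  */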
1674       slp_tree load_node;
1675       bool loads_permuted = false;
1676       FOR_EACH_VEC_ELT (loads, i, load_node)
1677 	{
1678 	  vec<unsigned> load_permutation;
1679 	  int j;
1680 	  gimple load, first_stmt;
1681 	  bool this_load_permuted = false;
1682 	  load_permutation.create (group_size);
1683 	  first_stmt = GROUP_FIRST_ELEMENT
1684 	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1685 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1686 	    {
1687 	      int load_place
1688 		= vect_get_place_in_interleaving_chain (load, first_stmt);
1689 	      gcc_assert (load_place != -1);
1690 	      if (load_place != j)
1691 		this_load_permuted = true;
1692 	      load_permutation.safe_push (load_place);
1693 	    }
1694 	  if (!this_load_permuted)
1695 	    {
1696 	      load_permutation.release ();
1697 	      continue;
1698 	    }
1699 	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1700 	  loads_permuted = true;
1701 	}
1702 
1703       if (loads_permuted)
1704         {
1705           if (!vect_supported_load_permutation_p (new_instance))
1706             {
1707               if (dump_enabled_p ())
1708                 {
1709                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 				   "Build SLP failed: unsupported load "
1711 				   "permutation ");
1712                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1713                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1714                 }
1715               vect_free_slp_instance (new_instance);
1716               return false;
1717             }
1718 
1719           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1720 	    = vect_find_first_load_in_slp_instance (new_instance);
1721         }
1722 
1723       /* Compute the costs of this SLP instance.  */
1724       vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1725 			     new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1726 
1727       if (loop_vinfo)
1728         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1729       else
1730         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1731 
1732       if (dump_enabled_p ())
1733 	vect_print_slp_tree (MSG_NOTE, node);
1734 
1735       return true;
1736     }
1737 
1738   /* Failed to SLP.  */
1739   /* Free the allocated memory.  */
1740   vect_free_slp_tree (node);
1741   loads.release ();
1742 
1743   return false;
1744 }
1745 
1746 
1747 /* Check if there are stmts in the loop that can be vectorized using SLP.
1748    Build SLP trees of packed scalar stmts if SLP is possible.  */
1749 
1750 bool
1751 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1752 		  unsigned max_tree_size)
1753 {
1754   unsigned int i;
1755   vec<gimple> grouped_stores;
1756   vec<gimple> reductions = vNULL;
1757   vec<gimple> reduc_chains = vNULL;
1758   gimple first_element;
1759   bool ok = false;
1760 
1761   if (dump_enabled_p ())
1762     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1763 
1764   if (loop_vinfo)
1765     {
1766       grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1767       reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1768       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1769     }
1770   else
1771     grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1772 
1773   /* Find SLP sequences starting from groups of grouped stores.  */
1774   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1775     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1776 				   max_tree_size))
1777       ok = true;
1778 
1779   if (bb_vinfo && !ok)
1780     {
1781       if (dump_enabled_p ())
1782         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1783 			 "Failed to SLP the basic block.\n");
1784 
1785       return false;
1786     }
1787 
1788   if (loop_vinfo
1789       && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1790     {
1791       /* Find SLP sequences starting from reduction chains.  */
1792       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1793         if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1794 				       max_tree_size))
1795           ok = true;
1796         else
1797           return false;
1798 
1799       /* Don't try to vectorize SLP reductions if a reduction chain was
1800          detected.  */
1801       return ok;
1802     }
1803 
1804   /* Find SLP sequences starting from groups of reductions.  */
1805   if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1806       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1807 				    max_tree_size))
1808     ok = true;
1809 
1810   return true;
1811 }
1812 
1813 
1814 /* For each possible SLP instance decide whether to SLP it and calculate the
1815    overall unrolling factor needed to SLP the loop.  Return TRUE if we decided
1816    to SLP at least one instance.  */
1817 
1818 bool
1819 vect_make_slp_decision (loop_vec_info loop_vinfo)
1820 {
1821   unsigned int i, unrolling_factor = 1;
1822   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1823   slp_instance instance;
1824   int decided_to_slp = 0;
1825 
1826   if (dump_enabled_p ())
1827     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1828                      "\n");
1829 
1830   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1831     {
1832       /* FORNOW: SLP if you can.  */
1833       if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1834 	unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1835 
1836       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1837 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1838 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1839       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1840       decided_to_slp++;
1841     }
1842 
1843   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1844 
1845   if (decided_to_slp && dump_enabled_p ())
1846     dump_printf_loc (MSG_NOTE, vect_location,
1847 		     "Decided to SLP %d instances. Unrolling factor %d\n",
1848 		     decided_to_slp, unrolling_factor);
1849 
1850   return (decided_to_slp > 0);
1851 }
1852 
1853 
1854 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1855    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
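/* A hypothetical example: if x_1 = a[i] + 1 belongs to an SLP instance
   while y_2 = x_1 * 2 is only loop-vectorized, then x_1's stmt feeds a
   non-SLP stmt and is marked HYBRID, so a vector def is created for it
   both by the SLP and by the loop-based code generation.  */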
1856 
1857 static void
1858 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1859 {
1860   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1861   imm_use_iterator imm_iter;
1862   gimple use_stmt;
1863   stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1864   slp_tree child;
1865   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1866   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1867   int j;
1868 
1869   /* Propagate hybrid down the SLP tree.  */
1870   if (stype == hybrid)
1871     ;
1872   else if (HYBRID_SLP_STMT (stmt_vinfo))
1873     stype = hybrid;
1874   else
1875     {
1876       /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1877       gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1878       /* We always get the pattern stmt here, but for immediate
1879 	 uses we have to use the LHS of the original stmt.  */
1880       gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1881       if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1882 	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1883       if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1884 	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1885 	  {
1886 	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1887 	      continue;
1888 	    use_vinfo = vinfo_for_stmt (use_stmt);
1889 	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1890 		&& STMT_VINFO_RELATED_STMT (use_vinfo))
1891 	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1892 	    if (!STMT_SLP_TYPE (use_vinfo)
1893 		&& (STMT_VINFO_RELEVANT (use_vinfo)
1894 		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1895 		&& !(gimple_code (use_stmt) == GIMPLE_PHI
1896 		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1897 	      stype = hybrid;
1898 	  }
1899     }
1900 
1901   if (stype == hybrid)
1902     STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1903 
1904   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1905     vect_detect_hybrid_slp_stmts (child, i, stype);
1906 }
1907 
1908 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1909 
1910 static tree
1911 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1912 {
1913   walk_stmt_info *wi = (walk_stmt_info *)data;
1914   struct loop *loopp = (struct loop *)wi->info;
1915 
1916   if (wi->is_lhs)
1917     return NULL_TREE;
1918 
1919   if (TREE_CODE (*tp) == SSA_NAME
1920       && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1921     {
1922       gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1923       if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1924 	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1925 	STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1926     }
1927 
1928   return NULL_TREE;
1929 }
1930 
1931 static tree
1932 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1933 			  walk_stmt_info *)
1934 {
1935   /* If the stmt is in an SLP instance then this isn't a reason
1936      to mark use definitions in other SLP instances as hybrid.  */
1937   if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1938     *handled = true;
1939   return NULL_TREE;
1940 }
1941 
1942 /* Find stmts that must be both vectorized and SLPed.  */
1943 
1944 void
1945 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1946 {
1947   unsigned int i;
1948   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1949   slp_instance instance;
1950 
1951   if (dump_enabled_p ())
1952     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1953                      "\n");
1954 
1955   /* First walk all pattern stmts in the loop and mark defs of their uses
1956      as hybrid, because immediate uses in pattern stmts are not recorded.  */
1957   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1958     {
1959       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1960       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1961 	   gsi_next (&gsi))
1962 	{
1963 	  gimple stmt = gsi_stmt (gsi);
1964 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1965 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1966 	    {
1967 	      walk_stmt_info wi;
1968 	      memset (&wi, 0, sizeof (wi));
1969 	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1970 	      gimple_stmt_iterator gsi2
1971 		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1972 	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1973 				vect_detect_hybrid_slp_1, &wi);
1974 	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1975 			       vect_detect_hybrid_slp_2,
1976 			       vect_detect_hybrid_slp_1, &wi);
1977 	    }
1978 	}
1979     }
1980 
1981   /* Then walk the SLP instance trees marking stmts with uses in
1982      non-SLP stmts as hybrid, also propagating hybrid down the
1983      SLP tree, collecting the above info on-the-fly.  */
1984   FOR_EACH_VEC_ELT (slp_instances, i, instance)
1985     {
1986       for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1987 	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1988 				      i, pure_slp);
1989     }
1990 }
1991 
1992 
1993 /* Create and initialize a new bb_vec_info struct for BB, as well as
1994    stmt_vec_info structs for all the stmts in it.  */
1995 
1996 static bb_vec_info
1997 new_bb_vec_info (basic_block bb)
1998 {
1999   bb_vec_info res = NULL;
2000   gimple_stmt_iterator gsi;
2001 
2002   res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2003   BB_VINFO_BB (res) = bb;
2004 
2005   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2006     {
2007       gimple stmt = gsi_stmt (gsi);
2008       gimple_set_uid (stmt, 0);
2009       set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2010     }
2011 
2012   BB_VINFO_GROUPED_STORES (res).create (10);
2013   BB_VINFO_SLP_INSTANCES (res).create (2);
2014   BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2015 
2016   bb->aux = res;
2017   return res;
2018 }
2019 
2020 
2021 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2022    stmts in the basic block.  */
2023 
2024 static void
2025 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2026 {
2027   vec<slp_instance> slp_instances;
2028   slp_instance instance;
2029   basic_block bb;
2030   gimple_stmt_iterator si;
2031   unsigned i;
2032 
2033   if (!bb_vinfo)
2034     return;
2035 
2036   bb = BB_VINFO_BB (bb_vinfo);
2037 
2038   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2039     {
2040       gimple stmt = gsi_stmt (si);
2041       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2042 
2043       if (stmt_info)
2044         /* Free stmt_vec_info.  */
2045         free_stmt_vec_info (stmt);
2046     }
2047 
2048   vect_destroy_datarefs (NULL, bb_vinfo);
2049   free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2050   BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2051   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2052   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2053     vect_free_slp_instance (instance);
2054   BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2055   destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2056   free (bb_vinfo);
2057   bb->aux = NULL;
2058 }
2059 
2060 
2061 /* Analyze statements contained in the SLP tree NODE after recursively
2062    analyzing the subtree.  Return TRUE if the operations are supported.  */
2063 
2064 static bool
2065 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2066 {
2067   bool dummy;
2068   int i;
2069   gimple stmt;
2070   slp_tree child;
2071 
2072   if (!node)
2073     return true;
2074 
2075   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2076     if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2077       return false;
2078 
2079   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2080     {
2081       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2082       gcc_assert (stmt_info);
2083       gcc_assert (PURE_SLP_STMT (stmt_info));
2084 
2085       if (!vect_analyze_stmt (stmt, &dummy, node))
2086         return false;
2087     }
2088 
2089   return true;
2090 }
2091 
2092 
2093 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2094    operations are supported.  */
2095 
2096 static bool
2097 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2098 {
2099   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2100   slp_instance instance;
2101   int i;
2102 
2103   for (i = 0; slp_instances.iterate (i, &instance); )
2104     {
2105       if (!vect_slp_analyze_node_operations (bb_vinfo,
2106                                              SLP_INSTANCE_TREE (instance)))
2107         {
2108 	  vect_free_slp_instance (instance);
2109           slp_instances.ordered_remove (i);
2110 	}
2111       else
2112         i++;
2113     }
2114 
2115   if (!slp_instances.length ())
2116     return false;
2117 
2118   return true;
2119 }
2120 
2121 
2122 /* Compute the scalar cost of the SLP node NODE and its children
2123    and return it.  Do not account for defs that are marked in LIFE, and
2124    update LIFE according to the uses of NODE.  */
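/* A sketch of the bookkeeping with made-up stmts: for a node with scalar
   stmts { t0 = a[0] + b[0]; t1 = a[1] + b[1]; }, if t1 is also used by a
   stmt outside the basic block then (*life)[1] is set and t1 contributes
   nothing to the returned scalar cost, which makes vectorization harder
   to justify.  */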
2125 
2126 static unsigned
2127 vect_bb_slp_scalar_cost (basic_block bb,
2128 			 slp_tree node, vec<bool, va_heap> *life)
2129 {
2130   unsigned scalar_cost = 0;
2131   unsigned i;
2132   gimple stmt;
2133   slp_tree child;
2134 
2135   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2136     {
2137       unsigned stmt_cost;
2138       ssa_op_iter op_iter;
2139       def_operand_p def_p;
2140       stmt_vec_info stmt_info;
2141 
2142       if ((*life)[i])
2143 	continue;
2144 
2145       /* If there is a non-vectorized use of the defs then the scalar
2146          stmt is kept live, in which case we do not account for it, or for
2147 	 any required defs in the SLP children, in the scalar cost.  This
2148 	 way we make the vectorization more costly when compared to the
2149 	 scalar cost.  */
2150       FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2151 	{
2152 	  imm_use_iterator use_iter;
2153 	  gimple use_stmt;
2154 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2155 	    if (!is_gimple_debug (use_stmt)
2156 		&& (gimple_code (use_stmt) == GIMPLE_PHI
2157 		    || gimple_bb (use_stmt) != bb
2158 		    || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2159 	      {
2160 		(*life)[i] = true;
2161 		BREAK_FROM_IMM_USE_STMT (use_iter);
2162 	      }
2163 	}
2164       if ((*life)[i])
2165 	continue;
2166 
2167       stmt_info = vinfo_for_stmt (stmt);
2168       if (STMT_VINFO_DATA_REF (stmt_info))
2169         {
2170           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2171             stmt_cost = vect_get_stmt_cost (scalar_load);
2172           else
2173             stmt_cost = vect_get_stmt_cost (scalar_store);
2174         }
2175       else
2176         stmt_cost = vect_get_stmt_cost (scalar_stmt);
2177 
2178       scalar_cost += stmt_cost;
2179     }
2180 
2181   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2182     scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2183 
2184   return scalar_cost;
2185 }
2186 
2187 /* Check if vectorization of the basic block is profitable.  */
2188 
2189 static bool
2190 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2191 {
2192   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2193   slp_instance instance;
2194   int i, j;
2195   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2196   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2197   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2198   stmt_vec_info stmt_info = NULL;
2199   stmt_vector_for_cost body_cost_vec;
2200   stmt_info_for_cost *ci;
2201 
2202   /* Calculate vector costs.  */
2203   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2204     {
2205       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2206 
2207       FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2208 	{
2209 	  stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2210 	  (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2211 				stmt_info, ci->misalign, vect_body);
2212 	}
2213     }
2214 
2215   /* Calculate scalar cost.  */
2216   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2217     {
2218       auto_vec<bool, 20> life;
2219       life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2220       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2221 					      SLP_INSTANCE_TREE (instance),
2222 					      &life);
2223     }
2224 
2225   /* Complete the target-specific cost calculation.  */
2226   finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2227 	       &vec_inside_cost, &vec_epilogue_cost);
2228 
2229   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2230 
2231   if (dump_enabled_p ())
2232     {
2233       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2234       dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2235 		   vec_inside_cost);
2236       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2237       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2238       dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2239     }
2240 
2241   /* Vectorization is profitable if its cost is less than the cost of the
2242      scalar version.  */
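  /* With made-up costs vec_inside_cost == 6, vec_outside_cost == 4 and
     scalar_cost == 12, we get 6 + 4 == 10 < 12 and the block is deemed
     profitable; had scalar_cost been 10 or less we would give up here.  */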
2243   if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2244     return false;
2245 
2246   return true;
2247 }
2248 
2249 /* Check if the basic block can be vectorized.  */
2250 
2251 static bb_vec_info
2252 vect_slp_analyze_bb_1 (basic_block bb)
2253 {
2254   bb_vec_info bb_vinfo;
2255   vec<slp_instance> slp_instances;
2256   slp_instance instance;
2257   int i;
2258   int min_vf = 2;
2259   unsigned n_stmts = 0;
2260 
2261   bb_vinfo = new_bb_vec_info (bb);
2262   if (!bb_vinfo)
2263     return NULL;
2264 
2265   if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2266     {
2267       if (dump_enabled_p ())
2268         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 			 "not vectorized: unhandled data-ref in basic "
2270 			 "block.\n");
2271 
2272       destroy_bb_vec_info (bb_vinfo);
2273       return NULL;
2274     }
2275 
2276   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2277     {
2278       if (dump_enabled_p ())
2279         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 			 "not vectorized: not enough data-refs in "
2281 			 "basic block.\n");
2282 
2283       destroy_bb_vec_info (bb_vinfo);
2284       return NULL;
2285     }
2286 
2287   if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2288     {
2289       if (dump_enabled_p ())
2290         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 			 "not vectorized: unhandled data access in "
2292 			 "basic block.\n");
2293 
2294       destroy_bb_vec_info (bb_vinfo);
2295       return NULL;
2296     }
2297 
2298   vect_pattern_recog (NULL, bb_vinfo);
2299 
2300   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2301     {
2302       if (dump_enabled_p ())
2303         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 			 "not vectorized: bad data alignment in basic "
2305 			 "block.\n");
2306 
2307       destroy_bb_vec_info (bb_vinfo);
2308       return NULL;
2309     }
2310 
2311   /* Check the SLP opportunities in the basic block, analyze and build SLP
2312      trees.  */
2313   if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2314     {
2315       if (dump_enabled_p ())
2316         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 			 "not vectorized: failed to find SLP opportunities "
2318 			 "in basic block.\n");
2319 
2320       destroy_bb_vec_info (bb_vinfo);
2321       return NULL;
2322     }
2323 
2324   slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2325 
2326   /* Mark all the statements that we want to vectorize as pure SLP and
2327      relevant.  */
2328   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2329     {
2330       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2331       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2332     }
2333 
2334   /* Mark all the statements that we do not want to vectorize.  */
2335   for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2336        !gsi_end_p (gsi); gsi_next (&gsi))
2337     {
2338       stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2339       if (STMT_SLP_TYPE (vinfo) != pure_slp)
2340 	STMT_VINFO_VECTORIZABLE (vinfo) = false;
2341     }
2342 
2343   /* Analyze dependences.  At this point all stmts not participating in
2344      vectorization have to be marked.  Dependence analysis assumes
2345      that we either vectorize all SLP instances or none at all.  */
2346   if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2347     {
2348       if (dump_enabled_p ())
2349 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350 			 "not vectorized: unhandled data dependence "
2351 			 "in basic block.\n");
2352 
2353       destroy_bb_vec_info (bb_vinfo);
2354       return NULL;
2355     }
2356 
2357   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2358     {
2359       if (dump_enabled_p ())
2360         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                          "not vectorized: unsupported alignment in basic "
2362                          "block.\n");
2363       destroy_bb_vec_info (bb_vinfo);
2364       return NULL;
2365     }
2366 
2367   if (!vect_slp_analyze_operations (bb_vinfo))
2368     {
2369       if (dump_enabled_p ())
2370         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371 			 "not vectorized: bad operation in basic block.\n");
2372 
2373       destroy_bb_vec_info (bb_vinfo);
2374       return NULL;
2375     }
2376 
2377   /* Cost model: check if the vectorization is worthwhile.  */
2378   if (!unlimited_cost_model (NULL)
2379       && !vect_bb_vectorization_profitable_p (bb_vinfo))
2380     {
2381       if (dump_enabled_p ())
2382         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 			 "not vectorized: vectorization is not "
2384 			 "profitable.\n");
2385 
2386       destroy_bb_vec_info (bb_vinfo);
2387       return NULL;
2388     }
2389 
2390   if (dump_enabled_p ())
2391     dump_printf_loc (MSG_NOTE, vect_location,
2392 		     "Basic block will be vectorized using SLP\n");
2393 
2394   return bb_vinfo;
2395 }
2396 
2397 
2398 bb_vec_info
2399 vect_slp_analyze_bb (basic_block bb)
2400 {
2401   bb_vec_info bb_vinfo;
2402   int insns = 0;
2403   gimple_stmt_iterator gsi;
2404   unsigned int vector_sizes;
2405 
2406   if (dump_enabled_p ())
2407     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_slp_analyze_bb ===\n");
2408 
2409   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2410     {
2411       gimple stmt = gsi_stmt (gsi);
2412       if (!is_gimple_debug (stmt)
2413           && !gimple_nop_p (stmt)
2414           && gimple_code (stmt) != GIMPLE_LABEL)
2415         insns++;
2416     }
2417 
2418   if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2419     {
2420       if (dump_enabled_p ())
2421         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 			 "not vectorized: too many instructions in "
2423 			 "basic block.\n");
2424 
2425       return NULL;
2426     }
2427 
2428   /* Autodetect first vector size we try.  */
2429   current_vector_size = 0;
2430   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
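  /* VECTOR_SIZES is a bitmask of the vector sizes the target supports;
     on a hypothetical target with 16- and 8-byte vectors it would be
     16 | 8 == 24.  The loop below first lets the analysis autodetect a
     size (current_vector_size == 0), then on failure clears the bit of
     the size just tried and retries with the largest remaining one.  */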
2431 
2432   while (1)
2433     {
2434       bb_vinfo = vect_slp_analyze_bb_1 (bb);
2435       if (bb_vinfo)
2436         return bb_vinfo;
2437 
2438       destroy_bb_vec_info (bb_vinfo);
2439 
2440       vector_sizes &= ~current_vector_size;
2441       if (vector_sizes == 0
2442           || current_vector_size == 0)
2443         return NULL;
2444 
2445       /* Try the next biggest vector size.  */
2446       current_vector_size = 1 << floor_log2 (vector_sizes);
2447       if (dump_enabled_p ())
2448         dump_printf_loc (MSG_NOTE, vect_location,
2449 			 "***** Re-trying analysis with "
2450 			 "vector size %d\n", current_vector_size);
2451     }
2452 }
2453 
2454 
2455 /* SLP costs are calculated according to the SLP instance unrolling factor
2456    (i.e., the number of created vector stmts depends on the unrolling factor).
2457    However, the actual number of vector stmts for every SLP node depends on
2458    VF, which is set later in vect_analyze_operations ().  Hence, SLP costs
2459    should be updated.  In this function we assume that the inside costs
2460    calculated in vect_model_xxx_cost are linear in ncopies.  */
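/* As an illustration with made-up numbers: with VF == 8 and an instance
   unrolling factor of 2, ncopies == 8 / 2 == 4 below, so a body cost
   entry that was recorded with count 1 is re-added to the target cost
   model with count 4.  */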
2461 
2462 void
2463 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2464 {
2465   unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2466   vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2467   slp_instance instance;
2468   stmt_vector_for_cost body_cost_vec;
2469   stmt_info_for_cost *si;
2470   void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2471 
2472   if (dump_enabled_p ())
2473     dump_printf_loc (MSG_NOTE, vect_location,
2474 		     "=== vect_update_slp_costs_according_to_vf ===\n");
2475 
2476   FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477     {
2478       /* We assume that costs are linear in ncopies.  */
2479       int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2480 
2481       /* Record the instance's instructions in the target cost model.
2482 	 This was delayed until here because the count of instructions
2483 	 isn't known beforehand.  */
2484       body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2485 
2486       FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2487 	(void) add_stmt_cost (data, si->count * ncopies, si->kind,
2488 			      vinfo_for_stmt (si->stmt), si->misalign,
2489 			      vect_body);
2490     }
2491 }
2492 
2493 
2494 /* For constant and loop invariant defs of SLP_NODE this function returns
2495    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2496    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2497    scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2498    REDUC_INDEX is the index of the reduction operand in the statements, unless
2499    it is -1.  */
2500 
2501 static void
2502 vect_get_constant_vectors (tree op, slp_tree slp_node,
2503                            vec<tree> *vec_oprnds,
2504 			   unsigned int op_num, unsigned int number_of_vectors,
2505                            int reduc_index)
2506 {
2507   vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2508   gimple stmt = stmts[0];
2509   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2510   unsigned nunits;
2511   tree vec_cst;
2512   tree *elts;
2513   unsigned j, number_of_places_left_in_vector;
2514   tree vector_type;
2515   tree vop;
2516   int group_size = stmts.length ();
2517   unsigned int vec_num, i;
2518   unsigned number_of_copies = 1;
2519   vec<tree> voprnds;
2520   voprnds.create (number_of_vectors);
2521   bool constant_p, is_store;
2522   tree neutral_op = NULL;
2523   enum tree_code code = gimple_expr_code (stmt);
2524   gimple def_stmt;
2525   struct loop *loop;
2526   gimple_seq ctor_seq = NULL;
2527 
2528   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2529       && reduc_index != -1)
2530     {
2531       op_num = reduc_index - 1;
2532       op = gimple_op (stmt, reduc_index);
2533       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2534          we need either neutral operands or the original operands.  See
2535          get_initial_def_for_reduction() for details.  */
2536       switch (code)
2537         {
2538           case WIDEN_SUM_EXPR:
2539           case DOT_PROD_EXPR:
2540           case PLUS_EXPR:
2541           case MINUS_EXPR:
2542           case BIT_IOR_EXPR:
2543           case BIT_XOR_EXPR:
2544              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2545                neutral_op = build_real (TREE_TYPE (op), dconst0);
2546              else
2547                neutral_op = build_int_cst (TREE_TYPE (op), 0);
2548 
2549              break;
2550 
2551           case MULT_EXPR:
2552              if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2553                neutral_op = build_real (TREE_TYPE (op), dconst1);
2554              else
2555                neutral_op = build_int_cst (TREE_TYPE (op), 1);
2556 
2557              break;
2558 
2559           case BIT_AND_EXPR:
2560             neutral_op = build_int_cst (TREE_TYPE (op), -1);
2561             break;
2562 
2563 	  /* For MIN/MAX we don't have an easy neutral operand but
2564 	     the initial values can be used fine here.  Only for
2565 	     a reduction chain do we have to force a neutral element.  */
2566 	  case MAX_EXPR:
2567 	  case MIN_EXPR:
2568 	    if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2569 	      neutral_op = NULL;
2570 	    else
2571 	      {
2572 		def_stmt = SSA_NAME_DEF_STMT (op);
2573 		loop = (gimple_bb (stmt))->loop_father;
2574 		neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2575 						    loop_preheader_edge (loop));
2576 	      }
2577 	    break;
2578 
2579           default:
2580             neutral_op = NULL;
2581         }
2582     }
2583 
2584   if (STMT_VINFO_DATA_REF (stmt_vinfo))
2585     {
2586       is_store = true;
2587       op = gimple_assign_rhs1 (stmt);
2588     }
2589   else
2590     is_store = false;
2591 
2592   gcc_assert (op);
2593 
2594   if (CONSTANT_CLASS_P (op))
2595     constant_p = true;
2596   else
2597     constant_p = false;
2598 
2599   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2600   gcc_assert (vector_type);
2601   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2602 
2603   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2604      the created vectors.  It is greater than 1 if unrolling is performed.
2605 
2606      For example, we have two scalar operands, s1 and s2 (e.g., group of
2607      strided accesses of size two), while NUNITS is four (i.e., four scalars
2608      of this type can be packed in a vector).  The output vector will contain
2609      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2610      will be 2).
2611 
2612      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2613      containing the operands.
2614 
2615      For example, NUNITS is four as before, and the group size is 8
2616      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2617      {s5, s6, s7, s8}.  */
2618 
2619   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2620 
2621   number_of_places_left_in_vector = nunits;
2622   elts = XALLOCAVEC (tree, nunits);
2623   for (j = 0; j < number_of_copies; j++)
2624     {
2625       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2626         {
2627           if (is_store)
2628             op = gimple_assign_rhs1 (stmt);
2629           else
2630 	    {
2631 	      switch (code)
2632 		{
2633 		  case COND_EXPR:
2634 		    if (op_num == 0 || op_num == 1)
2635 		      {
2636 			tree cond = gimple_assign_rhs1 (stmt);
2637 			op = TREE_OPERAND (cond, op_num);
2638 		      }
2639 		    else
2640 		      {
2641 			if (op_num == 2)
2642 			  op = gimple_assign_rhs2 (stmt);
2643 			else
2644 			  op = gimple_assign_rhs3 (stmt);
2645 		      }
2646 		    break;
2647 
2648 		  case CALL_EXPR:
2649 		    op = gimple_call_arg (stmt, op_num);
2650 		    break;
2651 
2652 		  case LSHIFT_EXPR:
2653 		  case RSHIFT_EXPR:
2654 		  case LROTATE_EXPR:
2655 		  case RROTATE_EXPR:
2656 		    op = gimple_op (stmt, op_num + 1);
2657 		    /* Unlike the other binary operators, shifts/rotates take a
2658 		       shift count of type int instead of the same type as the
2659 		       lhs, so make sure the scalar has the right type if we
2660 		       are dealing with vectors of
2661 		       long long/long/short/char.  */
2662 		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2663 		      op = fold_convert (TREE_TYPE (vector_type), op);
2664 		    break;
2665 
2666 		  default:
2667 		    op = gimple_op (stmt, op_num + 1);
2668 		    break;
2669 		}
2670 	    }
2671 
2672           if (reduc_index != -1)
2673             {
2674               loop = (gimple_bb (stmt))->loop_father;
2675               def_stmt = SSA_NAME_DEF_STMT (op);
2676 
2677               gcc_assert (loop);
2678 
2679               /* Get the def before the loop.  In a reduction chain we have
2680                  only one initial value.  */
2681               if ((j != (number_of_copies - 1)
2682                    || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2683                        && i != 0))
2684                   && neutral_op)
2685                 op = neutral_op;
2686               else
2687                 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2688                                             loop_preheader_edge (loop));
2689             }
2690 
2691           /* Create 'vect_ = {op0,op1,...,opn}'.  */
2692           number_of_places_left_in_vector--;
2693 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2694 	    {
2695 	      if (CONSTANT_CLASS_P (op))
2696 		{
2697 		  op = fold_unary (VIEW_CONVERT_EXPR,
2698 				   TREE_TYPE (vector_type), op);
2699 		  gcc_assert (op && CONSTANT_CLASS_P (op));
2700 		}
2701 	      else
2702 		{
2703 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2704 		  gimple init_stmt;
2705 		  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2706 		  init_stmt
2707 		    = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2708 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
2709 		  op = new_temp;
2710 		}
2711 	    }
2712 	  elts[number_of_places_left_in_vector] = op;
2713 	  if (!CONSTANT_CLASS_P (op))
2714 	    constant_p = false;
2715 
2716           if (number_of_places_left_in_vector == 0)
2717             {
2718               number_of_places_left_in_vector = nunits;
2719 
2720 	      if (constant_p)
2721 		vec_cst = build_vector (vector_type, elts);
2722 	      else
2723 		{
2724 		  vec<constructor_elt, va_gc> *v;
2725 		  unsigned k;
2726 		  vec_alloc (v, nunits);
2727 		  for (k = 0; k < nunits; ++k)
2728 		    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2729 		  vec_cst = build_constructor (vector_type, v);
2730 		}
2731               voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2732 						    vector_type, NULL));
2733 	      if (ctor_seq != NULL)
2734 		{
2735 		  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2736 		  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2737 		  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2738 							GSI_SAME_STMT);
2739 		  ctor_seq = NULL;
2740 		}
2741             }
2742         }
2743     }
2744 
2745   /* Since the vectors were created in reverse order, push them onto
2746      VEC_OPRNDS back to front to restore the original order.  */
2747   vec_num = voprnds.length ();
2748   for (j = vec_num; j != 0; j--)
2749     {
2750       vop = voprnds[j - 1];
2751       vec_oprnds->quick_push (vop);
2752     }
2753 
2754   voprnds.release ();
2755 
2756   /* If VF is greater than the unrolling factor needed for the SLP group of
2757      stmts, the NUMBER_OF_VECTORS to be created is greater than
2758      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2759      to replicate the vectors.  */
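  /* E.g., with made-up values: if one vector {s1, s2, s1, s2} was built
     above but NUMBER_OF_VECTORS is 2, the loop below pushes a second copy
     of it, or a vector of NEUTRAL_OP (all zeros for a PLUS_EXPR reduction)
     when a neutral element is required.  */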
2760   while (number_of_vectors > vec_oprnds->length ())
2761     {
2762       tree neutral_vec = NULL;
2763 
2764       if (neutral_op)
2765         {
2766           if (!neutral_vec)
2767 	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
2768 
2769           vec_oprnds->quick_push (neutral_vec);
2770         }
2771       else
2772         {
2773           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2774             vec_oprnds->quick_push (vop);
2775         }
2776     }
2777 }
2778 
2779 
2780 /* Get vectorized definitions from SLP_NODE that contains corresponding
2781    vectorized def-stmts.  */
2782 
2783 static void
2784 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2785 {
2786   tree vec_oprnd;
2787   gimple vec_def_stmt;
2788   unsigned int i;
2789 
2790   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2791 
2792   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2793     {
2794       gcc_assert (vec_def_stmt);
2795       vec_oprnd = gimple_get_lhs (vec_def_stmt);
2796       vec_oprnds->quick_push (vec_oprnd);
2797     }
2798 }
2799 
2800 
2801 /* Get vectorized definitions for SLP_NODE.
2802    If the scalar definitions are loop invariants or constants, collect them and
2803    call vect_get_constant_vectors() to create vector stmts.
2804    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2805    must be stored in the corresponding child of SLP_NODE, and we call
2806    vect_get_slp_vect_defs () to retrieve them.  */
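/* A hypothetical illustration: for scalar stmts c_i = a_i + 5, operand
   a_i matches the LHS of the first stmt of a child node and its vector
   defs are fetched from that child, while the invariant 5 matches no
   child and vect_get_constant_vectors () builds {5, 5, ...} vectors
   for it.  */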
2807 
2808 void
2809 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2810 		   vec<vec<tree> > *vec_oprnds, int reduc_index)
2811 {
2812   gimple first_stmt;
2813   int number_of_vects = 0, i;
2814   unsigned int child_index = 0;
2815   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2816   slp_tree child = NULL;
2817   vec<tree> vec_defs;
2818   tree oprnd;
2819   bool vectorized_defs;
2820 
2821   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2822   FOR_EACH_VEC_ELT (ops, i, oprnd)
2823     {
2824       /* For each operand we check if it has vectorized definitions in a child
2825 	 node or we need to create them (for invariants and constants).  We
2826 	 check if the LHS of the first stmt of the next child matches OPRND.
2827 	 If it does, we found the correct child.  Otherwise, we call
2828 	 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
2829 	 to check this child node again for the next operand.  */
2830       vectorized_defs = false;
2831       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2832         {
2833           child = SLP_TREE_CHILDREN (slp_node)[child_index];
2834 
2835 	  /* We have to check both pattern and original def, if available.  */
2836 	  gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2837 	  gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2838 
2839 	  if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2840 	      || (related
2841 		  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2842 	    {
2843 	      /* The number of vector defs is determined by the number of
2844 		 vector statements in the node from which we get those
2845 		 statements.  */
2846 	      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2847 	      vectorized_defs = true;
2848 	      child_index++;
2849 	    }
2850         }
2851 
2852       if (!vectorized_defs)
2853         {
2854           if (i == 0)
2855             {
2856               number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2857               /* Number of vector stmts was calculated according to LHS in
2858                  vect_schedule_slp_instance (), fix it by replacing LHS with
2859                  RHS, if necessary.  See vect_get_smallest_scalar_type () for
2860                  details.  */
2861               vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2862                                              &rhs_size_unit);
2863               if (rhs_size_unit != lhs_size_unit)
2864                 {
2865                   number_of_vects *= rhs_size_unit;
2866                   number_of_vects /= lhs_size_unit;
2867                 }
2868             }
2869         }
2870 
2871       /* Allocate memory for vectorized defs.  */
2872       vec_defs = vNULL;
2873       vec_defs.create (number_of_vects);
2874 
2875       /* For reduction defs we call vect_get_constant_vectors (), since we are
2876          looking for initial loop invariant values.  */
2877       if (vectorized_defs && reduc_index == -1)
2878         /* The defs are already vectorized.  */
2879 	vect_get_slp_vect_defs (child, &vec_defs);
2880       else
2881         /* Build vectors from scalar defs.  */
2882 	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2883                                    number_of_vects, reduc_index);
2884 
2885       vec_oprnds->quick_push (vec_defs);
2886 
2887       /* For reductions, we only need initial values.  */
2888       if (reduc_index != -1)
2889         return;
2890     }
2891 }
2892 
2893 
2894 /* Create NCOPIES permutation statements using the mask vector MASK and two
2895    input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for
2896    the first copy, shifting both indexes by STRIDE elements of DR_CHAIN for
2897    every subsequent copy.
2898    (STRIDE is the number of vectorized stmts for NODE divided by the number
2899    of copies.)
2900    VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
2901    where the created stmts must be inserted.  */
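/* With illustrative numbers: if NODE has 4 vector stmts and NCOPIES == 2
   then STRIDE == 2; copy 0 writes slot VECT_STMTS_COUNTER, copy 1 writes
   slot 2 + VECT_STMTS_COUNTER, and FIRST_VEC_INDX/SECOND_VEC_INDX both
   advance by 2 between the copies.  */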
2902 
2903 static inline void
2904 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2905                            tree mask, int first_vec_indx, int second_vec_indx,
2906                            gimple_stmt_iterator *gsi, slp_tree node,
2907                            tree vectype, vec<tree> dr_chain,
2908                            int ncopies, int vect_stmts_counter)
2909 {
2910   tree perm_dest;
2911   gimple perm_stmt = NULL;
2912   stmt_vec_info next_stmt_info;
2913   int i, stride;
2914   tree first_vec, second_vec, data_ref;
2915 
2916   stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2917 
2918   /* Initialize the vect stmts of NODE to properly insert the generated
2919      stmts later.  */
2920   for (i = SLP_TREE_VEC_STMTS (node).length ();
2921        i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2922     SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2923 
2924   perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2925   for (i = 0; i < ncopies; i++)
2926     {
2927       first_vec = dr_chain[first_vec_indx];
2928       second_vec = dr_chain[second_vec_indx];
2929 
2930       /* Generate the permute statement.  */
2931       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2932 				       first_vec, second_vec, mask);
2933       data_ref = make_ssa_name (perm_dest, perm_stmt);
2934       gimple_set_lhs (perm_stmt, data_ref);
2935       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2936 
2937       /* Store the vector statement in NODE.  */
2938       SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2939 
2940       first_vec_indx += stride;
2941       second_vec_indx += stride;
2942     }
2943 
2944   /* Mark the scalar stmt as vectorized.  */
2945   next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2946   STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2947 }
2948 
2949 
2950 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2951    return in CURRENT_MASK_ELEMENT its equivalent in target specific
2952    representation.  Check that the mask is valid and return FALSE if not.
2953    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2954    the next vector, i.e., the current first vector is not needed.  */
2955 
2956 static bool
2957 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2958                        int mask_nunits, bool only_one_vec, int index,
2959 		       unsigned char *mask, int *current_mask_element,
2960                        bool *need_next_vector, int *number_of_mask_fixes,
2961                        bool *mask_fixed, bool *needs_first_vector)
2962 {
2963   int i;
2964 
2965   /* Convert to target specific representation.  */
2966   *current_mask_element = first_mask_element + m;
2967   /* Adjust the value in case it's a mask for second and third vectors.  */
2968   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2969 
2970   if (*current_mask_element < mask_nunits)
2971     *needs_first_vector = true;
2972 
2973   /* We have only one input vector to permute but the mask accesses values in
2974      the next vector as well.  */
2975   if (only_one_vec && *current_mask_element >= mask_nunits)
2976     {
2977       if (dump_enabled_p ())
2978         {
2979           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2980 			   "permutation requires at least two vectors ");
2981           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2982           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2983         }
2984 
2985       return false;
2986     }
2987 
2988   /* The mask requires the next vector.  */
2989   while (*current_mask_element >= mask_nunits * 2)
2990     {
2991       if (*needs_first_vector || *mask_fixed)
2992         {
2993           /* We either need the first vector too or have already moved to the
2994              next vector. In both cases, this permutation needs three
2995              vectors.  */
2996           if (dump_enabled_p ())
2997             {
2998               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2999 			       "permutation requires at "
3000 			       "least three vectors ");
3001               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3002               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3003             }
3004 
3005           return false;
3006         }
3007 
3008       /* We move to the next vector, dropping the first one and working with
3009          the second and the third - we need to adjust the values of the mask
3010          accordingly.  */
3011       *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3012 
3013       for (i = 0; i < index; i++)
3014         mask[i] -= mask_nunits * *number_of_mask_fixes;
3015 
3016       (*number_of_mask_fixes)++;
3017       *mask_fixed = true;
3018     }
3019 
3020   *need_next_vector = *mask_fixed;
3021 
3022   /* This was the last element of this mask. Start a new one.  */
3023   if (index == mask_nunits - 1)
3024     {
3025       *number_of_mask_fixes = 1;
3026       *mask_fixed = false;
3027       *needs_first_vector = false;
3028     }
3029 
3030   return true;
3031 }
3032 
3033 
3034 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3035    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3036    permute statements for the SLP node NODE of the SLP instance
3037    SLP_NODE_INSTANCE.  */
3038 
3039 bool
3040 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3041                               gimple_stmt_iterator *gsi, int vf,
3042                               slp_instance slp_node_instance, bool analyze_only)
3043 {
3044   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3045   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3046   tree mask_element_type = NULL_TREE, mask_type;
3047   int i, j, k, nunits, vec_index = 0, scalar_index;
3048   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3049   gimple next_scalar_stmt;
3050   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3051   int first_mask_element;
3052   int index, unroll_factor, current_mask_element, ncopies;
3053   unsigned char *mask;
3054   bool only_one_vec = false, need_next_vector = false;
3055   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3056   int number_of_mask_fixes = 1;
3057   bool mask_fixed = false;
3058   bool needs_first_vector = false;
3059   machine_mode mode;
3060 
3061   mode = TYPE_MODE (vectype);
3062 
3063   if (!can_vec_perm_p (mode, false, NULL))
3064     {
3065       if (dump_enabled_p ())
3066         {
3067           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3068 			   "no vect permute for ");
3069           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3070           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3071         }
3072       return false;
3073     }
3074 
3075   /* The generic VEC_PERM_EXPR code always uses an integral type of the
3076      same size as the vector element being permuted.  */
3077   mask_element_type = lang_hooks.types.type_for_mode
3078 		(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3079   mask_type = get_vectype_for_scalar_type (mask_element_type);
3080   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3081   mask = XALLOCAVEC (unsigned char, nunits);
3082   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3083 
3084   /* The number of vector stmts to generate based only on the
3085      SLP_NODE_INSTANCE unrolling factor.  */
3086   orig_vec_stmts_num = group_size *
3087                 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3088   if (orig_vec_stmts_num == 1)
3089     only_one_vec = true;
3090 
3091   /* The number of copies is determined by the final vectorization factor
3092      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3093   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3094 
3095   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3096     return false;
3097 
3098   /* Generate permutation masks for every NODE.  The number of masks for
3099      each NODE is equal to GROUP_SIZE.
3100      E.g., we have a group of three nodes with three loads from the same
3101      location in each node, and the vector size is 4.  I.e., we have an
3102      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3103      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3104      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3105      ...
3106 
3107      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3108      The last mask is illegal since we assume two operands for the permute
3109      operation, and the mask element values can't be outside that range.
3110      Hence, the last mask must be converted into {2,5,5,5}.
3111      For the first two permutations we need the first and the second input
3112      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3113      we need the second and the third vectors: {b1,c1,a2,b2} and
3114      {c2,a3,b3,c3}.  */
3115 
3116     {
3117       scalar_index = 0;
3118       index = 0;
3119       vect_stmts_counter = 0;
3120       vec_index = 0;
3121       first_vec_index = vec_index++;
3122       if (only_one_vec)
3123         second_vec_index = first_vec_index;
3124       else
3125         second_vec_index = vec_index++;
3126 
3127       for (j = 0; j < unroll_factor; j++)
3128         {
3129           for (k = 0; k < group_size; k++)
3130             {
3131 	      i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3132               first_mask_element = i + j * group_size;
3133               if (!vect_get_mask_element (stmt, first_mask_element, 0,
3134 					  nunits, only_one_vec, index,
3135 					  mask, &current_mask_element,
3136 					  &need_next_vector,
3137 					  &number_of_mask_fixes, &mask_fixed,
3138 					  &needs_first_vector))
3139 		return false;
3140 	      gcc_assert (current_mask_element < 2 * nunits);
3141 	      mask[index++] = current_mask_element;
3142 
3143               if (index == nunits)
3144                 {
3145 		  index = 0;
3146 		  if (!can_vec_perm_p (mode, false, mask))
3147 		    {
3148 		      if (dump_enabled_p ())
3149 			{
3150 			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3151 					   vect_location,
3152 					   "unsupported vect permute { ");
3153 			  for (i = 0; i < nunits; ++i)
3154 			    dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3155 					 mask[i]);
3156 			  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3157 			}
3158 		      return false;
3159 		    }
3160 
3161                   if (!analyze_only)
3162                     {
3163 		      int l;
3164 		      tree mask_vec, *mask_elts;
3165 		      mask_elts = XALLOCAVEC (tree, nunits);
3166 		      for (l = 0; l < nunits; ++l)
3167 			mask_elts[l] = build_int_cst (mask_element_type,
3168 						      mask[l]);
3169 		      mask_vec = build_vector (mask_type, mask_elts);
3170 
3171 		      if (need_next_vector)
3172                         {
3173                           first_vec_index = second_vec_index;
3174                           second_vec_index = vec_index;
3175                         }
3176 
3177                       next_scalar_stmt
3178 			  = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3179 
3180                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
3181                                mask_vec, first_vec_index, second_vec_index,
3182 			       gsi, node, vectype, dr_chain,
3183 			       ncopies, vect_stmts_counter++);
3184                     }
3185                 }
3186             }
3187         }
3188     }
3189 
3190   return true;
3191 }


/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be
     created for the scalar stmts in each node of the SLP tree.  One
     vector iteration covers the number of scalar elements in one scalar
     iteration (GROUP_SIZE) multiplied by VF; dividing that by the
     number of elements in a vector yields the number of vector stmts.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
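  /* For example (illustrative numbers only): GROUP_SIZE == 2, VF == 4
     and 4-element vectors give (4 * 2) / 4 == 2 vector stmts per node;
     for basic-block SLP VF is 1, so a group of 8 scalars in 4-element
     vectors likewise needs (1 * 8) / 4 == 2.  */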

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }
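
  /* Illustrative placement example: for a load group a[0]..a[3] the
     vector load is emitted where the first scalar load stood, so every
     scalar use still follows its definition; for a store group
     a[0]..a[3] = ... the vector store is emitted at the last scalar
     store, by which point all stored values have been computed.  */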

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}

/* Replace scalar calls from SLP node NODE with assignments of zero to
   their lhs.  For loop vectorization this is done in vectorizable_call,
   but for SLP it needs to be deferred until the end of
   vect_schedule_slp, because multiple SLP instances may refer to the
   same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}
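
/* An illustrative before/after (hypothetical stmt names): once the SLP
   instances referring to it have been vectorized, a scalar call such as

     x_1 = sqrtf (a_2);

   is rewritten by the loop above into

     x_1 = 0.0;

   which keeps the SSA definition of x_1 in place but leaves a trivially
   dead assignment for later DCE instead of a call.  */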

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}
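
/* Illustrative note (hypothetical stmt names): for a stored group
   a[0] = x_1; ... a[3] = x_4; the loop above unlinks the four scalar
   stores that the single vector store has replaced and frees their
   stmt_vec_infos; scalar stmts that merely feed the stores are left
   for DCE.  */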


/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}
3433