xref: /netbsd-src/external/gpl3/gcc/dist/gcc/tree-vect-slp.cc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"		/* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52 
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 					  slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56 
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
59 
60 void
61 vect_slp_init (void)
62 {
63   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
64 }
65 
66 void
67 vect_slp_fini (void)
68 {
69   while (slp_first_node)
70     delete slp_first_node;
71   delete slp_tree_pool;
72   slp_tree_pool = NULL;
73 }
74 
75 void *
76 _slp_tree::operator new (size_t n)
77 {
78   gcc_assert (n == sizeof (_slp_tree));
79   return slp_tree_pool->allocate_raw ();
80 }
81 
82 void
83 _slp_tree::operator delete (void *node, size_t n)
84 {
85   gcc_assert (n == sizeof (_slp_tree));
86   slp_tree_pool->remove_raw (node);
87 }
88 
89 
90 /* Initialize an SLP node.  */
91 
92 _slp_tree::_slp_tree ()
93 {
94   this->prev_node = NULL;
95   if (slp_first_node)
96     slp_first_node->prev_node = this;
97   this->next_node = slp_first_node;
98   slp_first_node = this;
99   SLP_TREE_SCALAR_STMTS (this) = vNULL;
100   SLP_TREE_SCALAR_OPS (this) = vNULL;
101   SLP_TREE_VEC_STMTS (this) = vNULL;
102   SLP_TREE_VEC_DEFS (this) = vNULL;
103   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104   SLP_TREE_CHILDREN (this) = vNULL;
105   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108   SLP_TREE_CODE (this) = ERROR_MARK;
109   SLP_TREE_VECTYPE (this) = NULL_TREE;
110   SLP_TREE_REPRESENTATIVE (this) = NULL;
111   SLP_TREE_REF_COUNT (this) = 1;
112   this->failed = NULL;
113   this->max_nunits = 1;
114   this->lanes = 0;
115 }
116 
117 /* Tear down an SLP node.  */
118 
119 _slp_tree::~_slp_tree ()
120 {
121   if (this->prev_node)
122     this->prev_node->next_node = this->next_node;
123   else
124     slp_first_node = this->next_node;
125   if (this->next_node)
126     this->next_node->prev_node = this->prev_node;
127   SLP_TREE_CHILDREN (this).release ();
128   SLP_TREE_SCALAR_STMTS (this).release ();
129   SLP_TREE_SCALAR_OPS (this).release ();
130   SLP_TREE_VEC_STMTS (this).release ();
131   SLP_TREE_VEC_DEFS (this).release ();
132   SLP_TREE_LOAD_PERMUTATION (this).release ();
133   SLP_TREE_LANE_PERMUTATION (this).release ();
134   if (this->failed)
135     free (failed);
136 }
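
/* Note that every _slp_tree links itself into the global slp_first_node
   list from its constructor and unlinks itself in its destructor, which
   is what allows vect_slp_fini above to delete any nodes still live at
   the end of the pass.  */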
137 
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
139 
140 void
141 vect_free_slp_tree (slp_tree node)
142 {
143   int i;
144   slp_tree child;
145 
146   if (--SLP_TREE_REF_COUNT (node) != 0)
147     return;
148 
149   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150     if (child)
151       vect_free_slp_tree (child);
152 
153   /* If the node defines any SLP-only patterns then those patterns are no
154      longer valid and should be removed.  */
155   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157     {
158       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161     }
162 
163   delete node;
164 }
165 
166 /* Return a location suitable for dumps related to the SLP instance.  */
167 
168 dump_user_location_t
169 _slp_instance::location () const
170 {
171   if (!root_stmts.is_empty ())
172     return root_stmts[0]->stmt;
173   else
174     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176 
177 
178 /* Free the memory allocated for the SLP instance.  */
179 
180 void
181 vect_free_slp_instance (slp_instance instance)
182 {
183   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184   SLP_INSTANCE_LOADS (instance).release ();
185   SLP_INSTANCE_ROOT_STMTS (instance).release ();
186   instance->subgraph_entries.release ();
187   instance->cost_vec.release ();
188   free (instance);
189 }
190 
191 
192 /* Create an SLP node with room for NOPS children and operation code CODE.  */
193 
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197   slp_tree node = new _slp_tree;
198   SLP_TREE_SCALAR_STMTS (node) = vNULL;
199   SLP_TREE_CHILDREN (node).create (nops);
200   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201   SLP_TREE_CODE (node) = code;
202   return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS.  */
205 
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211   SLP_TREE_CHILDREN (node).create (nops);
212   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214   SLP_TREE_LANES (node) = scalar_stmts.length ();
215   return node;
216 }
217 
218 /* Create an SLP node for SCALAR_STMTS.  */
219 
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225 
226 /* Create an SLP node for OPS.  */
227 
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231   SLP_TREE_SCALAR_OPS (node) = ops;
232   SLP_TREE_DEF_TYPE (node) = vect_external_def;
233   SLP_TREE_LANES (node) = ops.length ();
234   return node;
235 }
236 
237 /* Create an SLP node for OPS.  */
238 
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
241 {
242   return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244 
245 
246 /* This structure is used in creation of an SLP tree.  Each instance
247    corresponds to the same operand in a group of scalar stmts in an SLP
248    node.  */
249 typedef struct _slp_oprnd_info
250 {
251   /* Def-stmts for the operands.  */
252   vec<stmt_vec_info> def_stmts;
253   /* Operands.  */
254   vec<tree> ops;
255   /* Information about the first statement, its vector def-type, type, the
256      operand itself in case it's constant, and an indication if it's a pattern
257      stmt.  */
258   tree first_op_type;
259   enum vect_def_type first_dt;
260   bool any_pattern;
261 } *slp_oprnd_info;
262 
263 
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265    operand.  */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
268 {
269   int i;
270   slp_oprnd_info oprnd_info;
271   vec<slp_oprnd_info> oprnds_info;
272 
273   oprnds_info.create (nops);
274   for (i = 0; i < nops; i++)
275     {
276       oprnd_info = XNEW (struct _slp_oprnd_info);
277       oprnd_info->def_stmts.create (group_size);
278       oprnd_info->ops.create (group_size);
279       oprnd_info->first_dt = vect_uninitialized_def;
280       oprnd_info->first_op_type = NULL_TREE;
281       oprnd_info->any_pattern = false;
282       oprnds_info.quick_push (oprnd_info);
283     }
284 
285   return oprnds_info;
286 }
287 
288 
289 /* Free operands info.  */
290 
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294   int i;
295   slp_oprnd_info oprnd_info;
296 
297   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298     {
299       oprnd_info->def_stmts.release ();
300       oprnd_info->ops.release ();
301       XDELETE (oprnd_info);
302     }
303 
304   oprnds_info.release ();
305 }
306 
307 
308 /* Return true if STMTS contains a pattern statement.  */
309 
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313   stmt_vec_info stmt_info;
314   unsigned int i;
315   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316     if (is_pattern_stmt_p (stmt_info))
317       return true;
318   return false;
319 }
320 
321 /* Return true when all lanes in the external or constant NODE have
322    the same value.  */
323 
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329 
330   /* Pre-existing vectors.  */
331   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332     return false;
333 
334   unsigned i;
335   tree op, first = NULL_TREE;
336   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337     if (!first)
338       first = op;
339     else if (!operand_equal_p (first, op, 0))
340       return false;
341 
342   return true;
343 }
344 
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
347    of the chain.  */
348 
349 int
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 				      stmt_vec_info first_stmt_info)
352 {
353   stmt_vec_info next_stmt_info = first_stmt_info;
354   int result = 0;
355 
356   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357     return -1;
358 
359   do
360     {
361       if (next_stmt_info == stmt_info)
362 	return result;
363       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364       if (next_stmt_info)
365 	result += DR_GROUP_GAP (next_stmt_info);
366     }
367   while (next_stmt_info);
368 
369   return -1;
370 }
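
/* Illustrative example (assuming the usual convention that DR_GROUP_GAP
   of a non-first element is its distance from the previous element):
   for a group accessing a[0], a[2] and a[3] the gaps of the second and
   third elements are 2 and 1, so the place returned for the a[3]
   statement is 0 + 2 + 1 = 3.  */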
371 
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373    using the method implemented by duplicate_and_interleave.  Return true
374    if so, returning the number of intermediate vectors in *NVECTORS_OUT
375    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376    (if nonnull).  */
377 
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 				tree elt_type, unsigned int *nvectors_out,
381 				tree *vector_type_out,
382 				tree *permutes)
383 {
384   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386     return false;
387 
388   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390   unsigned int nvectors = 1;
391   for (;;)
392     {
393       scalar_int_mode int_mode;
394       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396 	{
397 	  /* Get the natural vector type for this SLP group size.  */
398 	  tree int_type = build_nonstandard_integer_type
399 	    (GET_MODE_BITSIZE (int_mode), 1);
400 	  tree vector_type
401 	    = get_vectype_for_scalar_type (vinfo, int_type, count);
402 	  if (vector_type
403 	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 			   GET_MODE_SIZE (base_vector_mode)))
406 	    {
407 	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 		 together into elements of type INT_TYPE and using the result
409 		 to build NVECTORS vectors.  */
410 	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 	      vec_perm_builder sel1 (nelts, 2, 3);
412 	      vec_perm_builder sel2 (nelts, 2, 3);
413 	      poly_int64 half_nelts = exact_div (nelts, 2);
414 	      for (unsigned int i = 0; i < 3; ++i)
415 		{
416 		  sel1.quick_push (i);
417 		  sel1.quick_push (i + nelts);
418 		  sel2.quick_push (half_nelts + i);
419 		  sel2.quick_push (half_nelts + i + nelts);
420 		}
421 	      vec_perm_indices indices1 (sel1, 2, nelts);
422 	      vec_perm_indices indices2 (sel2, 2, nelts);
423 	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425 		{
426 		  if (nvectors_out)
427 		    *nvectors_out = nvectors;
428 		  if (vector_type_out)
429 		    *vector_type_out = vector_type;
430 		  if (permutes)
431 		    {
432 		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 								indices1);
434 		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 								indices2);
436 		    }
437 		  return true;
438 		}
439 	    }
440 	}
441       if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 	return false;
443       nvectors *= 2;
444     }
445 }
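
/* As a concrete illustration of the selectors built above: for
   nelts == 4, indices1 expands to { 0, 4, 1, 5 } and indices2 to
   { 2, 6, 3, 7 }, i.e. the interleave-low and interleave-high
   permutations of two input vectors.  */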
446 
447 /* Return true if DTA and DTB match.  */
448 
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452   return (dta == dtb
453 	  || ((dta == vect_external_def || dta == vect_constant_def)
454 	      && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456 
457 static const int cond_expr_maps[3][5] = {
458   { 4, -1, -2, 1, 2 },
459   { 4, -2, -1, 1, 2 },
460   { 4, -1, -2, 2, 1 }
461 };
462 static const int arg1_map[] = { 1, 1 };
463 static const int arg2_map[] = { 1, 2 };
464 static const int arg1_arg4_map[] = { 2, 1, 4 };
465 
466 /* For most SLP statements, there is a one-to-one mapping between
467    gimple arguments and child nodes.  If that is not true for STMT,
468    return an array that contains:
469 
470    - the number of child nodes, followed by
471    - for each child node, the index of the argument associated with that node.
472      The special index -1 is the first operand of an embedded comparison and
473      the special index -2 is the second operand of an embedded comparison.
474 
475    SWAP is as for vect_get_and_check_slp_defs.  */
476 
477 static const int *
478 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
479 {
480   if (auto assign = dyn_cast<const gassign *> (stmt))
481     {
482       if (gimple_assign_rhs_code (assign) == COND_EXPR
483 	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
484 	return cond_expr_maps[swap];
485     }
486   gcc_assert (!swap);
487   if (auto call = dyn_cast<const gcall *> (stmt))
488     {
489       if (gimple_call_internal_p (call))
490 	switch (gimple_call_internal_fn (call))
491 	  {
492 	  case IFN_MASK_LOAD:
493 	    return arg2_map;
494 
495 	  case IFN_GATHER_LOAD:
496 	    return arg1_map;
497 
498 	  case IFN_MASK_GATHER_LOAD:
499 	    return arg1_arg4_map;
500 
501 	  default:
502 	    break;
503 	  }
504     }
505   return nullptr;
506 }
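
/* For example, arg1_arg4_map above is { 2, 1, 4 }: the corresponding SLP
   node gets two children, mapped to call arguments 1 and 4 of an
   IFN_MASK_GATHER_LOAD; the remaining arguments get no child nodes and
   are instead required to be equal across the group (see
   compatible_calls_p below).  */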
507 
508 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
509    they are of a valid type and that they match the defs of the first stmt of
510    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
511    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
512    indicates swap is required for cond_expr stmts.  Specifically, SWAP
513    is 1 if STMT is cond and operands of comparison need to be swapped;
514    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
515 
516    If there was a fatal error return -1; if the error could be corrected by
517    swapping operands of this node's parent, return 1; if everything is
518    OK return 0.  */
519 static int
520 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
521 			     bool *skip_args,
522 			     vec<stmt_vec_info> stmts, unsigned stmt_num,
523 			     vec<slp_oprnd_info> *oprnds_info)
524 {
525   stmt_vec_info stmt_info = stmts[stmt_num];
526   tree oprnd;
527   unsigned int i, number_of_oprnds;
528   enum vect_def_type dt = vect_uninitialized_def;
529   slp_oprnd_info oprnd_info;
530   unsigned int commutative_op = -1U;
531   bool first = stmt_num == 0;
532 
533   if (!is_a<gcall *> (stmt_info->stmt)
534       && !is_a<gassign *> (stmt_info->stmt)
535       && !is_a<gphi *> (stmt_info->stmt))
536     return -1;
537 
538   number_of_oprnds = gimple_num_args (stmt_info->stmt);
539   const int *map = vect_get_operand_map (stmt_info->stmt, swap);
540   if (map)
541     number_of_oprnds = *map++;
542   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
543     {
544       if (gimple_call_internal_p (stmt))
545 	{
546 	  internal_fn ifn = gimple_call_internal_fn (stmt);
547 	  commutative_op = first_commutative_argument (ifn);
548 	}
549     }
550   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
551     {
552       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
553 	commutative_op = 0;
554     }
555 
556   bool swapped = (swap != 0);
557   bool backedge = false;
558   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
559   for (i = 0; i < number_of_oprnds; i++)
560     {
561       int opno = map ? map[i] : int (i);
562       if (opno < 0)
563 	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
564       else
565 	{
566 	  oprnd = gimple_arg (stmt_info->stmt, opno);
567 	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
568 	    backedge = dominated_by_p (CDI_DOMINATORS,
569 				       gimple_phi_arg_edge (stmt, opno)->src,
570 				       gimple_bb (stmt_info->stmt));
571 	}
572       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
573 	oprnd = TREE_OPERAND (oprnd, 0);
574 
575       oprnd_info = (*oprnds_info)[i];
576 
577       stmt_vec_info def_stmt_info;
578       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
579 	{
580 	  if (dump_enabled_p ())
581 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582 			     "Build SLP failed: can't analyze def for %T\n",
583 			     oprnd);
584 
585 	  return -1;
586 	}
587 
588       if (skip_args[i])
589 	{
590 	  oprnd_info->def_stmts.quick_push (NULL);
591 	  oprnd_info->ops.quick_push (NULL_TREE);
592 	  oprnd_info->first_dt = vect_uninitialized_def;
593 	  continue;
594 	}
595 
596       oprnd_info->def_stmts.quick_push (def_stmt_info);
597       oprnd_info->ops.quick_push (oprnd);
598 
599       if (def_stmt_info
600 	  && is_pattern_stmt_p (def_stmt_info))
601 	{
602 	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
603 	      != def_stmt_info)
604 	    oprnd_info->any_pattern = true;
605 	  else
606 	    /* If we promote this to external use the original stmt def.  */
607 	    oprnd_info->ops.last ()
608 	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
609 	}
610 
611       /* If there's an extern def on a backedge make sure we can
612 	 code-generate at the region start.
613 	 ???  This is another case that could be fixed by adjusting
614 	 how we split the function but at the moment we'd have conflicting
615 	 goals there.  */
616       if (backedge
617 	  && dts[i] == vect_external_def
618 	  && is_a <bb_vec_info> (vinfo)
619 	  && TREE_CODE (oprnd) == SSA_NAME
620 	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
621 	  && !dominated_by_p (CDI_DOMINATORS,
622 			      as_a <bb_vec_info> (vinfo)->bbs[0],
623 			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
624 	{
625 	  if (dump_enabled_p ())
626 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 			     "Build SLP failed: extern def %T only defined "
628 			     "on backedge\n", oprnd);
629 	  return -1;
630 	}
631 
632       if (first)
633 	{
634 	  tree type = TREE_TYPE (oprnd);
635 	  dt = dts[i];
636 	  if ((dt == vect_constant_def
637 	       || dt == vect_external_def)
638 	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
639 	      && (TREE_CODE (type) == BOOLEAN_TYPE
640 		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
641 						      type)))
642 	    {
643 	      if (dump_enabled_p ())
644 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 				 "Build SLP failed: invalid type of def "
646 				 "for variable-length SLP %T\n", oprnd);
647 	      return -1;
648 	    }
649 
650 	  /* For the swapping logic below force vect_reduction_def
651 	     for the reduction op in an SLP reduction group.  */
652 	  if (!STMT_VINFO_DATA_REF (stmt_info)
653 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
654 	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
655 	      && def_stmt_info)
656 	    dts[i] = dt = vect_reduction_def;
657 
658 	  /* Check the types of the definition.  */
659 	  switch (dt)
660 	    {
661 	    case vect_external_def:
662 	    case vect_constant_def:
663 	    case vect_internal_def:
664 	    case vect_reduction_def:
665 	    case vect_induction_def:
666 	    case vect_nested_cycle:
667 	      break;
668 
669 	    default:
670 	      /* FORNOW: Not supported.  */
671 	      if (dump_enabled_p ())
672 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673 				 "Build SLP failed: illegal type of def %T\n",
674 				 oprnd);
675 	      return -1;
676 	    }
677 
678 	  oprnd_info->first_dt = dt;
679 	  oprnd_info->first_op_type = type;
680 	}
681     }
682   if (first)
683     return 0;
684 
685   /* Now match the operand definition types to that of the first stmt.  */
686   for (i = 0; i < number_of_oprnds;)
687     {
688       if (skip_args[i])
689 	{
690 	  ++i;
691 	  continue;
692 	}
693 
694       oprnd_info = (*oprnds_info)[i];
695       dt = dts[i];
696       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
697       oprnd = oprnd_info->ops[stmt_num];
698       tree type = TREE_TYPE (oprnd);
699 
700       if (!types_compatible_p (oprnd_info->first_op_type, type))
701 	{
702 	  if (dump_enabled_p ())
703 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 			     "Build SLP failed: different operand types\n");
705 	  return 1;
706 	}
707 
708       /* Not first stmt of the group, check that the def-stmt/s match
709 	 the def-stmt/s of the first stmt.  Allow different definition
710 	 types for reduction chains: the first stmt must be a
711 	 vect_reduction_def (a phi node), and the rest
712 	 end in the reduction chain.  */
713       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
714 	   && !(oprnd_info->first_dt == vect_reduction_def
715 		&& !STMT_VINFO_DATA_REF (stmt_info)
716 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 		&& def_stmt_info
718 		&& !STMT_VINFO_DATA_REF (def_stmt_info)
719 		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
720 		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
721 	  || (!STMT_VINFO_DATA_REF (stmt_info)
722 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
723 	      && ((!def_stmt_info
724 		   || STMT_VINFO_DATA_REF (def_stmt_info)
725 		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
726 		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
727 		  != (oprnd_info->first_dt != vect_reduction_def))))
728 	{
729 	  /* Try swapping operands if we got a mismatch.  For BB
730 	     vectorization only in case it will clearly improve things.  */
731 	  if (i == commutative_op && !swapped
732 	      && (!is_a <bb_vec_info> (vinfo)
733 		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
734 					     dts[i+1])
735 		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
736 			  || vect_def_types_match
737 			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
738 	    {
739 	      if (dump_enabled_p ())
740 		dump_printf_loc (MSG_NOTE, vect_location,
741 				 "trying swapped operands\n");
742 	      std::swap (dts[i], dts[i+1]);
743 	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
744 			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
745 	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
746 			 (*oprnds_info)[i+1]->ops[stmt_num]);
747 	      swapped = true;
748 	      continue;
749 	    }
750 
751 	  if (is_a <bb_vec_info> (vinfo)
752 	      && !oprnd_info->any_pattern)
753 	    {
754 	      /* Now for commutative ops we should see whether we can
755 		 make the other operand match.  */
756 	      if (dump_enabled_p ())
757 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 				 "treating operand as external\n");
759 	      oprnd_info->first_dt = dt = vect_external_def;
760 	    }
761 	  else
762 	    {
763 	      if (dump_enabled_p ())
764 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 				 "Build SLP failed: different types\n");
766 	      return 1;
767 	    }
768 	}
769 
770       /* Make sure to demote the overall operand to external.  */
771       if (dt == vect_external_def)
772 	oprnd_info->first_dt = vect_external_def;
773       /* For an SLP reduction chain we want to duplicate the reduction to
774 	 each of the chain members.  That gets us a sane SLP graph (though
775 	 the stmts are not 100% correct wrt the initial values).  */
776       else if ((dt == vect_internal_def
777 		|| dt == vect_reduction_def)
778 	       && oprnd_info->first_dt == vect_reduction_def
779 	       && !STMT_VINFO_DATA_REF (stmt_info)
780 	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 	       && !STMT_VINFO_DATA_REF (def_stmt_info)
782 	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
783 		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
784 	{
785 	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
786 	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
787 	}
788 
789       ++i;
790     }
791 
792   /* Swap operands.  */
793   if (swapped)
794     {
795       if (dump_enabled_p ())
796 	dump_printf_loc (MSG_NOTE, vect_location,
797 			 "swapped operands to match def types in %G",
798 			 stmt_info->stmt);
799     }
800 
801   return 0;
802 }
803 
804 /* Return true if call statements CALL1 and CALL2 are similar enough
805    to be combined into the same SLP group.  */
806 
807 bool
808 compatible_calls_p (gcall *call1, gcall *call2)
809 {
810   unsigned int nargs = gimple_call_num_args (call1);
811   if (nargs != gimple_call_num_args (call2))
812     return false;
813 
814   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
815     return false;
816 
817   if (gimple_call_internal_p (call1))
818     {
819       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
820 			       TREE_TYPE (gimple_call_lhs (call2))))
821 	return false;
822       for (unsigned int i = 0; i < nargs; ++i)
823 	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
824 				 TREE_TYPE (gimple_call_arg (call2, i))))
825 	  return false;
826     }
827   else
828     {
829       if (!operand_equal_p (gimple_call_fn (call1),
830 			    gimple_call_fn (call2), 0))
831 	return false;
832 
833       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
834 	return false;
835     }
836 
837   /* Check that any unvectorized arguments are equal.  */
838   if (const int *map = vect_get_operand_map (call1))
839     {
840       unsigned int nkept = *map++;
841       unsigned int mapi = 0;
842       for (unsigned int i = 0; i < nargs; ++i)
843 	if (mapi < nkept && map[mapi] == int (i))
844 	  mapi += 1;
845 	else if (!operand_equal_p (gimple_call_arg (call1, i),
846 				   gimple_call_arg (call2, i)))
847 	  return false;
848     }
849 
850   return true;
851 }
852 
853 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
854    caller's attempt to find the vector type in STMT_INFO with the narrowest
855    element type.  Return true if VECTYPE is nonnull and if it is valid
856    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
857    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
858    vect_build_slp_tree.  */
859 
860 static bool
861 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
862 			unsigned int group_size,
863 			tree vectype, poly_uint64 *max_nunits)
864 {
865   if (!vectype)
866     {
867       if (dump_enabled_p ())
868 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869 			 "Build SLP failed: unsupported data-type in %G\n",
870 			 stmt_info->stmt);
871       /* Fatal mismatch.  */
872       return false;
873     }
874 
875   /* If populating the vector type requires unrolling then fail
876      before adjusting *max_nunits for basic-block vectorization.  */
877   if (is_a <bb_vec_info> (vinfo)
878       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
879     {
880       if (dump_enabled_p ())
881 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 			 "Build SLP failed: unrolling required "
883 			 "in basic block SLP\n");
884       /* Fatal mismatch.  */
885       return false;
886     }
887 
888   /* In case of multiple types we need to detect the smallest type.  */
889   vect_update_max_nunits (max_nunits, vectype);
890   return true;
891 }
892 
893 /* Verify whether the scalar stmts STMTS are isomorphic, require data
894    permutation or are of unsupported types of operation.  Return
895    true if they are isomorphic, otherwise return false and indicate in
896    *MATCHES which stmts are not isomorphic to the first one.  If MATCHES[0]
897    is false then this indicates the comparison could not be
898    carried out or the stmts will never be vectorized by SLP.
899 
900    Note COND_EXPR is possibly isomorphic to another one after swapping its
901    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
902    the first stmt by swapping the two operands of comparison; set SWAP[i]
903    to 2 if stmt I is isomorphic to the first stmt by inverting the code
904    of comparison.  For example, A1 >= B1 ? X1 : Y1 can be swapped to
905    (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1.  */
906 
907 static bool
908 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
909 		       vec<stmt_vec_info> stmts, unsigned int group_size,
910 		       poly_uint64 *max_nunits, bool *matches,
911 		       bool *two_operators, tree *node_vectype)
912 {
913   unsigned int i;
914   stmt_vec_info first_stmt_info = stmts[0];
915   code_helper first_stmt_code = ERROR_MARK;
916   code_helper alt_stmt_code = ERROR_MARK;
917   code_helper rhs_code = ERROR_MARK;
918   code_helper first_cond_code = ERROR_MARK;
919   tree lhs;
920   bool need_same_oprnds = false;
921   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
922   stmt_vec_info first_load = NULL, prev_first_load = NULL;
923   bool first_stmt_load_p = false, load_p = false;
924   bool first_stmt_phi_p = false, phi_p = false;
925   bool maybe_soft_fail = false;
926   tree soft_fail_nunits_vectype = NULL_TREE;
927 
928   /* For every stmt in NODE find its def stmt/s.  */
929   stmt_vec_info stmt_info;
930   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
931     {
932       gimple *stmt = stmt_info->stmt;
933       swap[i] = 0;
934       matches[i] = false;
935 
936       if (dump_enabled_p ())
937 	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
938 
939       /* Fail to vectorize statements marked as unvectorizable, statements
940 	 that may throw, or statements with volatile operands.  */
941       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
942 	  || stmt_can_throw_internal (cfun, stmt)
943 	  || gimple_has_volatile_ops (stmt))
944         {
945           if (dump_enabled_p ())
946 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 			     "Build SLP failed: unvectorizable statement %G",
948 			     stmt);
949 	  /* ???  For BB vectorization we want to commute operands in a way
950 	     that shuffles all unvectorizable defs into one operand and keeps
951 	     the other vectorizable.  The following doesn't reliably work for
952 	     this, but it's the easiest we can do here.  */
953 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
954 	    continue;
955 	  /* Fatal mismatch.  */
956 	  matches[0] = false;
957           return false;
958         }
959 
960       lhs = gimple_get_lhs (stmt);
961       if (lhs == NULL_TREE)
962 	{
963 	  if (dump_enabled_p ())
964 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 			     "Build SLP failed: not GIMPLE_ASSIGN nor "
966 			     "GIMPLE_CALL %G", stmt);
967 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
968 	    continue;
969 	  /* Fatal mismatch.  */
970 	  matches[0] = false;
971 	  return false;
972 	}
973 
974       tree nunits_vectype;
975       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
976 					   &nunits_vectype, group_size))
977 	{
978 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
979 	    continue;
980 	  /* Fatal mismatch.  */
981 	  matches[0] = false;
982 	  return false;
983 	}
984       /* Record nunits required but continue analysis, producing matches[]
985 	 as if nunits was not an issue.  This allows splitting of groups
986 	 to happen.  */
987       if (nunits_vectype
988 	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
989 				      nunits_vectype, max_nunits))
990 	{
991 	  gcc_assert (is_a <bb_vec_info> (vinfo));
992 	  maybe_soft_fail = true;
993 	  soft_fail_nunits_vectype = nunits_vectype;
994 	}
995 
996       gcc_assert (vectype);
997 
998       gcall *call_stmt = dyn_cast <gcall *> (stmt);
999       if (call_stmt)
1000 	{
1001 	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
1002 	  if (cfn != CFN_LAST)
1003 	    rhs_code = cfn;
1004 	  else
1005 	    rhs_code = CALL_EXPR;
1006 
1007 	  if (cfn == CFN_MASK_LOAD
1008 	      || cfn == CFN_GATHER_LOAD
1009 	      || cfn == CFN_MASK_GATHER_LOAD)
1010 	    load_p = true;
1011 	  else if ((internal_fn_p (cfn)
1012 		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1013 		   || gimple_call_tail_p (call_stmt)
1014 		   || gimple_call_noreturn_p (call_stmt)
1015 		   || gimple_call_chain (call_stmt))
1016 	    {
1017 	      if (dump_enabled_p ())
1018 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 				 "Build SLP failed: unsupported call type %G",
1020 				 call_stmt);
1021 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 		continue;
1023 	      /* Fatal mismatch.  */
1024 	      matches[0] = false;
1025 	      return false;
1026 	    }
1027 	}
1028       else if (gimple_code (stmt) == GIMPLE_PHI)
1029 	{
1030 	  rhs_code = ERROR_MARK;
1031 	  phi_p = true;
1032 	}
1033       else
1034 	{
1035 	  rhs_code = gimple_assign_rhs_code (stmt);
1036 	  load_p = gimple_vuse (stmt);
1037 	}
1038 
1039       /* Check the operation.  */
1040       if (i == 0)
1041 	{
1042 	  *node_vectype = vectype;
1043 	  first_stmt_code = rhs_code;
1044 	  first_stmt_load_p = load_p;
1045 	  first_stmt_phi_p = phi_p;
1046 
1047 	  /* Shift arguments should be equal in all the packed stmts for a
1048 	     vector shift with a scalar shift operand.  */
1049 	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1050 	      || rhs_code == LROTATE_EXPR
1051 	      || rhs_code == RROTATE_EXPR)
1052 	    {
1053 	      /* First see if we have a vector/vector shift.  */
1054 	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
1055 		{
1056 		  /* No vector/vector shift, try for a vector/scalar shift.  */
1057 		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1058 		    {
1059 		      if (dump_enabled_p ())
1060 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 					 "Build SLP failed: "
1062 					 "op not supported by target.\n");
1063 		      if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 			continue;
1065 		      /* Fatal mismatch.  */
1066 		      matches[0] = false;
1067 		      return false;
1068 		    }
1069 		  need_same_oprnds = true;
1070 		  first_op1 = gimple_assign_rhs2 (stmt);
1071 		}
1072 	    }
1073 	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
1074             {
1075               need_same_oprnds = true;
1076               first_op1 = gimple_assign_rhs2 (stmt);
1077             }
1078 	  else if (!load_p
1079 		   && rhs_code == BIT_FIELD_REF)
1080 	    {
1081 	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1082 	      if (!is_a <bb_vec_info> (vinfo)
1083 		  || TREE_CODE (vec) != SSA_NAME
1084 		  || !operand_equal_p (TYPE_SIZE (vectype),
1085 				       TYPE_SIZE (TREE_TYPE (vec))))
1086 		{
1087 		  if (dump_enabled_p ())
1088 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 				     "Build SLP failed: "
1090 				     "BIT_FIELD_REF not supported\n");
1091 		  /* Fatal mismatch.  */
1092 		  matches[0] = false;
1093 		  return false;
1094 		}
1095 	    }
1096 	  else if (rhs_code == CFN_DIV_POW2)
1097 	    {
1098 	      need_same_oprnds = true;
1099 	      first_op1 = gimple_call_arg (call_stmt, 1);
1100 	    }
1101 	}
1102       else
1103 	{
1104 	  if (first_stmt_code != rhs_code
1105 	      && alt_stmt_code == ERROR_MARK)
1106 	    alt_stmt_code = rhs_code;
1107 	  if ((first_stmt_code != rhs_code
1108 	       && (first_stmt_code != IMAGPART_EXPR
1109 		   || rhs_code != REALPART_EXPR)
1110 	       && (first_stmt_code != REALPART_EXPR
1111 		   || rhs_code != IMAGPART_EXPR)
1112 	       /* Handle mismatches in plus/minus by computing both
1113 		  and merging the results.  */
1114 	       && !((first_stmt_code == PLUS_EXPR
1115 		     || first_stmt_code == MINUS_EXPR)
1116 		    && (alt_stmt_code == PLUS_EXPR
1117 			|| alt_stmt_code == MINUS_EXPR)
1118 		    && rhs_code == alt_stmt_code)
1119 	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 		    && (first_stmt_code == ARRAY_REF
1121 			|| first_stmt_code == BIT_FIELD_REF
1122 			|| first_stmt_code == INDIRECT_REF
1123 			|| first_stmt_code == COMPONENT_REF
1124 			|| first_stmt_code == MEM_REF)
1125 		    && (rhs_code == ARRAY_REF
1126 			|| rhs_code == BIT_FIELD_REF
1127 			|| rhs_code == INDIRECT_REF
1128 			|| rhs_code == COMPONENT_REF
1129 			|| rhs_code == MEM_REF)))
1130 	      || first_stmt_load_p != load_p
1131 	      || first_stmt_phi_p != phi_p)
1132 	    {
1133 	      if (dump_enabled_p ())
1134 		{
1135 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 				   "Build SLP failed: different operation "
1137 				   "in stmt %G", stmt);
1138 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 				   "original stmt %G", first_stmt_info->stmt);
1140 		}
1141 	      /* Mismatch.  */
1142 	      continue;
1143 	    }
1144 
1145 	  if (!load_p
1146 	      && first_stmt_code == BIT_FIELD_REF
1147 	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1148 		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1149 	    {
1150 	      if (dump_enabled_p ())
1151 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 				 "Build SLP failed: different BIT_FIELD_REF "
1153 				 "arguments in %G", stmt);
1154 	      /* Mismatch.  */
1155 	      continue;
1156 	    }
1157 
1158 	  if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1159 	    {
1160 	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1161 				       call_stmt))
1162 		{
1163 		  if (dump_enabled_p ())
1164 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 				     "Build SLP failed: different calls in %G",
1166 				     stmt);
1167 		  /* Mismatch.  */
1168 		  continue;
1169 		}
1170 	    }
1171 
1172 	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1173 	      && (gimple_bb (first_stmt_info->stmt)
1174 		  != gimple_bb (stmt_info->stmt)))
1175 	    {
1176 	      if (dump_enabled_p ())
1177 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 				 "Build SLP failed: different BB for PHI "
1179 				 "or possibly trapping operation in %G", stmt);
1180 	      /* Mismatch.  */
1181 	      continue;
1182 	    }
1183 
1184 	  if (need_same_oprnds)
1185 	    {
1186 	      tree other_op1 = gimple_arg (stmt, 1);
1187 	      if (!operand_equal_p (first_op1, other_op1, 0))
1188 		{
1189 		  if (dump_enabled_p ())
1190 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 				     "Build SLP failed: different shift "
1192 				     "arguments in %G", stmt);
1193 		  /* Mismatch.  */
1194 		  continue;
1195 		}
1196 	    }
1197 
1198 	  if (!types_compatible_p (vectype, *node_vectype))
1199 	    {
1200 	      if (dump_enabled_p ())
1201 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 				 "Build SLP failed: different vector type "
1203 				 "in %G", stmt);
1204 	      /* Mismatch.  */
1205 	      continue;
1206 	    }
1207 	}
1208 
1209       /* Grouped store or load.  */
1210       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1211 	{
1212 	  if (REFERENCE_CLASS_P (lhs))
1213 	    {
1214 	      /* Store.  */
1215 	      ;
1216 	    }
1217 	  else
1218 	    {
1219 	      /* Load.  */
1220 	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1221               if (prev_first_load)
1222                 {
1223                   /* Check that there are no loads from different interleaving
1224                      chains in the same node.  */
1225                   if (prev_first_load != first_load)
1226                     {
1227                       if (dump_enabled_p ())
1228 			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1229 					 vect_location,
1230 					 "Build SLP failed: different "
1231 					 "interleaving chains in one node %G",
1232 					 stmt);
1233 		      /* Mismatch.  */
1234 		      continue;
1235                     }
1236                 }
1237               else
1238                 prev_first_load = first_load;
1239            }
1240         } /* Grouped access.  */
1241       else
1242 	{
1243 	  if (load_p
1244 	      && rhs_code != CFN_GATHER_LOAD
1245 	      && rhs_code != CFN_MASK_GATHER_LOAD)
1246 	    {
1247 	      /* Non-grouped load.  */
1248 	      if (dump_enabled_p ())
1249 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 				 "Build SLP failed: not grouped load %G", stmt);
1251 
1252 	      /* FORNOW: Non-grouped loads are not supported.  */
1253 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1254 		continue;
1255 	      /* Fatal mismatch.  */
1256 	      matches[0] = false;
1257 	      return false;
1258 	    }
1259 
1260 	  /* Not a memory operation.  */
1261 	  if (!phi_p
1262 	      && rhs_code.is_tree_code ()
1263 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1264 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1265 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1266 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1267 	      && rhs_code != VIEW_CONVERT_EXPR
1268 	      && rhs_code != CALL_EXPR
1269 	      && rhs_code != BIT_FIELD_REF)
1270 	    {
1271 	      if (dump_enabled_p ())
1272 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273 				 "Build SLP failed: operation unsupported %G",
1274 				 stmt);
1275 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1276 		continue;
1277 	      /* Fatal mismatch.  */
1278 	      matches[0] = false;
1279 	      return false;
1280 	    }
1281 
1282 	  if (rhs_code == COND_EXPR)
1283 	    {
1284 	      tree cond_expr = gimple_assign_rhs1 (stmt);
1285 	      enum tree_code cond_code = TREE_CODE (cond_expr);
1286 	      enum tree_code swap_code = ERROR_MARK;
1287 	      enum tree_code invert_code = ERROR_MARK;
1288 
1289 	      if (i == 0)
1290 		first_cond_code = TREE_CODE (cond_expr);
1291 	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1292 		{
1293 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1294 		  swap_code = swap_tree_comparison (cond_code);
1295 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
1296 		}
1297 
1298 	      if (first_cond_code == cond_code)
1299 		;
1300 	      /* Isomorphism can be achieved by swapping.  */
1301 	      else if (first_cond_code == swap_code)
1302 		swap[i] = 1;
1303 	      /* Isomorphism can be achieved by inverting.  */
1304 	      else if (first_cond_code == invert_code)
1305 		swap[i] = 2;
1306 	      else
1307 		{
1308 		  if (dump_enabled_p ())
1309 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 				     "Build SLP failed: different"
1311 				     " operation %G", stmt);
1312 		  /* Mismatch.  */
1313 		  continue;
1314 		}
1315 	    }
1316 	}
1317 
1318       matches[i] = true;
1319     }
1320 
1321   for (i = 0; i < group_size; ++i)
1322     if (!matches[i])
1323       return false;
1324 
1325   /* If we allowed a two-operation SLP node verify the target can cope
1326      with the permute we are going to use.  */
1327   if (alt_stmt_code != ERROR_MARK
1328       && (!alt_stmt_code.is_tree_code ()
1329 	  || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
1330     {
1331       *two_operators = true;
1332     }
1333 
1334   if (maybe_soft_fail)
1335     {
1336       unsigned HOST_WIDE_INT const_nunits;
1337       if (!TYPE_VECTOR_SUBPARTS
1338 	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
1339 	  || const_nunits > group_size)
1340 	matches[0] = false;
1341       else
1342 	{
1343 	  /* With a constant number of vector elements simulate a mismatch
1344 	     at the point where we need to split.  */
1345 	  unsigned tail = group_size & (const_nunits - 1);
1346 	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1347 	}
1348       return false;
1349     }
1350 
1351   return true;
1352 }
1353 
1354 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1355    Note we never remove entries apart from at destruction time, so we do
1356    not need a special value for deleted that differs from empty.  */
1357 struct bst_traits
1358 {
1359   typedef vec <stmt_vec_info> value_type;
1360   typedef vec <stmt_vec_info> compare_type;
1361   static inline hashval_t hash (value_type);
1362   static inline bool equal (value_type existing, value_type candidate);
1363   static inline bool is_empty (value_type x) { return !x.exists (); }
1364   static inline bool is_deleted (value_type x) { return !x.exists (); }
1365   static const bool empty_zero_p = true;
1366   static inline void mark_empty (value_type &x) { x.release (); }
1367   static inline void mark_deleted (value_type &x) { x.release (); }
1368   static inline void remove (value_type &x) { x.release (); }
1369 };
1370 inline hashval_t
1371 bst_traits::hash (value_type x)
1372 {
1373   inchash::hash h;
1374   for (unsigned i = 0; i < x.length (); ++i)
1375     h.add_int (gimple_uid (x[i]->stmt));
1376   return h.end ();
1377 }
1378 inline bool
1379 bst_traits::equal (value_type existing, value_type candidate)
1380 {
1381   if (existing.length () != candidate.length ())
1382     return false;
1383   for (unsigned i = 0; i < existing.length (); ++i)
1384     if (existing[i] != candidate[i])
1385       return false;
1386   return true;
1387 }
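
/* Hashing and equality both work on the vector of scalar stmts itself
   (via the gimple UIDs the vectorizer assigns), so repeated SLP build
   attempts for the same statement sequence hit the same bst_map entry
   and can re-use a previously built node or its recorded failure.  */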
1388 
1389 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1390    but then vec::insert does memmove and that's not compatible with
1391    std::pair.  */
1392 struct chain_op_t
1393 {
1394   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1395       : code (code_), dt (dt_), op (op_) {}
1396   tree_code code;
1397   vect_def_type dt;
1398   tree op;
1399 };
1400 
1401 /* Comparator for sorting associatable chains.  */
1402 
1403 static int
1404 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1405 {
1406   auto *op1 = (const chain_op_t *) op1_;
1407   auto *op2 = (const chain_op_t *) op2_;
1408   if (op1->dt != op2->dt)
1409     return (int)op1->dt - (int)op2->dt;
1410   return (int)op1->code - (int)op2->code;
1411 }
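
/* dt_sort_cmp orders chain entries first by definition type and then by
   operation code, so after sorting a linearized chain all operands with
   the same def type (e.g. all constants or all externals) are adjacent.  */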
1412 
1413 /* Linearize the associatable expression chain at START with the
1414    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1415    filling CHAIN with the result and using WORKLIST as intermediate storage.
1416    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1417    or MINUS_EXPR.  If CHAIN_STMTS is not NULL it is filled with all
1418    computation stmts, starting with START.  */
1419 
1420 static void
1421 vect_slp_linearize_chain (vec_info *vinfo,
1422 			  vec<std::pair<tree_code, gimple *> > &worklist,
1423 			  vec<chain_op_t> &chain,
1424 			  enum tree_code code, gimple *start,
1425 			  gimple *&code_stmt, gimple *&alt_code_stmt,
1426 			  vec<gimple *> *chain_stmts)
1427 {
1428   /* For each lane linearize the addition/subtraction (or other
1429      uniform associatable operation) expression tree.  */
1430   worklist.safe_push (std::make_pair (code, start));
1431   while (!worklist.is_empty ())
1432     {
1433       auto entry = worklist.pop ();
1434       gassign *stmt = as_a <gassign *> (entry.second);
1435       enum tree_code in_code = entry.first;
1436       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1437       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1438       if (!code_stmt
1439 	  && gimple_assign_rhs_code (stmt) == code)
1440 	code_stmt = stmt;
1441       else if (!alt_code_stmt
1442 	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1443 	alt_code_stmt = stmt;
1444       if (chain_stmts)
1445 	chain_stmts->safe_push (stmt);
1446       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1447 	{
1448 	  tree op = gimple_op (stmt, opnum);
1449 	  vect_def_type dt;
1450 	  stmt_vec_info def_stmt_info;
1451 	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1452 	  gcc_assert (res);
1453 	  if (dt == vect_internal_def
1454 	      && is_pattern_stmt_p (def_stmt_info))
1455 	    op = gimple_get_lhs (def_stmt_info->stmt);
1456 	  gimple *use_stmt;
1457 	  use_operand_p use_p;
1458 	  if (dt == vect_internal_def
1459 	      && single_imm_use (op, &use_p, &use_stmt)
1460 	      && is_gimple_assign (def_stmt_info->stmt)
1461 	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1462 		  || (code == PLUS_EXPR
1463 		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
1464 			  == MINUS_EXPR))))
1465 	    {
1466 	      tree_code op_def_code = this_code;
1467 	      if (op_def_code == MINUS_EXPR && opnum == 1)
1468 		op_def_code = PLUS_EXPR;
1469 	      if (in_code == MINUS_EXPR)
1470 		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1471 	      worklist.safe_push (std::make_pair (op_def_code,
1472 						  def_stmt_info->stmt));
1473 	    }
1474 	  else
1475 	    {
1476 	      tree_code op_def_code = this_code;
1477 	      if (op_def_code == MINUS_EXPR && opnum == 1)
1478 		op_def_code = PLUS_EXPR;
1479 	      if (in_code == MINUS_EXPR)
1480 		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1481 	      chain.safe_push (chain_op_t (op_def_code, dt, op));
1482 	    }
1483 	}
1484     }
1485 }
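
/* Worked example: linearizing a = (x + y) - z with CODE == PLUS_EXPR,
   assuming the intermediate x + y result has a single use, produces the
   chain entries (+, x), (+, y) and (-, z) (in worklist order), with the
   MINUS_EXPR statement recorded in ALT_CODE_STMT and the PLUS_EXPR
   statement in CODE_STMT.  */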
1486 
1487 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1488 		  simple_hashmap_traits <bst_traits, slp_tree> >
1489   scalar_stmts_to_slp_tree_map_t;
1490 
1491 static slp_tree
1492 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1493 		       vec<stmt_vec_info> stmts, unsigned int group_size,
1494 		       poly_uint64 *max_nunits,
1495 		       bool *matches, unsigned *limit, unsigned *tree_size,
1496 		       scalar_stmts_to_slp_tree_map_t *bst_map);
1497 
1498 static slp_tree
1499 vect_build_slp_tree (vec_info *vinfo,
1500 		     vec<stmt_vec_info> stmts, unsigned int group_size,
1501 		     poly_uint64 *max_nunits,
1502 		     bool *matches, unsigned *limit, unsigned *tree_size,
1503 		     scalar_stmts_to_slp_tree_map_t *bst_map)
1504 {
1505   if (slp_tree *leader = bst_map->get (stmts))
1506     {
1507       if (dump_enabled_p ())
1508 	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1509 			 !(*leader)->failed ? "" : "failed ", *leader);
1510       if (!(*leader)->failed)
1511 	{
1512 	  SLP_TREE_REF_COUNT (*leader)++;
1513 	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1514 	  stmts.release ();
1515 	  return *leader;
1516 	}
1517       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1518       return NULL;
1519     }
1520 
1521   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1522      so we can pick up backedge destinations during discovery.  */
1523   slp_tree res = new _slp_tree;
1524   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1525   SLP_TREE_SCALAR_STMTS (res) = stmts;
1526   bst_map->put (stmts.copy (), res);
1527 
1528   if (*limit == 0)
1529     {
1530       if (dump_enabled_p ())
1531 	dump_printf_loc (MSG_NOTE, vect_location,
1532 			 "SLP discovery limit exceeded\n");
1533       /* Mark the node invalid so we can detect it when it is still in use
1534 	 as a backedge destination.  */
1535       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1536       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1537       res->failed = XNEWVEC (bool, group_size);
1538       memset (res->failed, 0, sizeof (bool) * group_size);
1539       memset (matches, 0, sizeof (bool) * group_size);
1540       return NULL;
1541     }
1542   --*limit;
1543 
1544   if (dump_enabled_p ())
1545     dump_printf_loc (MSG_NOTE, vect_location,
1546 		     "starting SLP discovery for node %p\n", res);
1547 
1548   poly_uint64 this_max_nunits = 1;
1549   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1550 					&this_max_nunits,
1551 					matches, limit, tree_size, bst_map);
1552   if (!res_)
1553     {
1554       if (dump_enabled_p ())
1555 	dump_printf_loc (MSG_NOTE, vect_location,
1556 			 "SLP discovery for node %p failed\n", res);
1557       /* Mark the node invalid so we can detect it when it is still in use
1558 	 as a backedge destination.  */
1559       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1560       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1561       res->failed = XNEWVEC (bool, group_size);
1562       if (flag_checking)
1563 	{
1564 	  unsigned i;
1565 	  for (i = 0; i < group_size; ++i)
1566 	    if (!matches[i])
1567 	      break;
1568 	  gcc_assert (i < group_size);
1569 	}
1570       memcpy (res->failed, matches, sizeof (bool) * group_size);
1571     }
1572   else
1573     {
1574       if (dump_enabled_p ())
1575 	dump_printf_loc (MSG_NOTE, vect_location,
1576 			 "SLP discovery for node %p succeeded\n", res);
1577       gcc_assert (res_ == res);
1578       res->max_nunits = this_max_nunits;
1579       vect_update_max_nunits (max_nunits, this_max_nunits);
1580       /* Keep a reference for the bst_map use.  */
1581       SLP_TREE_REF_COUNT (res)++;
1582     }
1583   return res_;
1584 }
1585 
1586 /* Helper for building an associated SLP node chain.  */
1587 
1588 static void
1589 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1590 				   slp_tree op0, slp_tree op1,
1591 				   stmt_vec_info oper1, stmt_vec_info oper2,
1592 				   vec<std::pair<unsigned, unsigned> > lperm)
1593 {
1594   unsigned group_size = SLP_TREE_LANES (op1);
1595 
1596   slp_tree child1 = new _slp_tree;
1597   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1598   SLP_TREE_VECTYPE (child1) = vectype;
1599   SLP_TREE_LANES (child1) = group_size;
1600   SLP_TREE_CHILDREN (child1).create (2);
1601   SLP_TREE_CHILDREN (child1).quick_push (op0);
1602   SLP_TREE_CHILDREN (child1).quick_push (op1);
1603   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1604 
1605   slp_tree child2 = new _slp_tree;
1606   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1607   SLP_TREE_VECTYPE (child2) = vectype;
1608   SLP_TREE_LANES (child2) = group_size;
1609   SLP_TREE_CHILDREN (child2).create (2);
1610   SLP_TREE_CHILDREN (child2).quick_push (op0);
1611   SLP_TREE_REF_COUNT (op0)++;
1612   SLP_TREE_CHILDREN (child2).quick_push (op1);
1613   SLP_TREE_REF_COUNT (op1)++;
1614   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1615 
1616   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1617   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1618   SLP_TREE_VECTYPE (perm) = vectype;
1619   SLP_TREE_LANES (perm) = group_size;
1620   /* ???  We should set this NULL but that's not expected.  */
1621   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1622   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1623   SLP_TREE_CHILDREN (perm).quick_push (child1);
1624   SLP_TREE_CHILDREN (perm).quick_push (child2);
1625 }
1626 
1627 /* Recursively build an SLP tree starting from NODE.
1628    Fail (and return NULL) if def-stmts are not isomorphic, require data
1629    permutation or are of unsupported types of operation.  Otherwise
1630    return the built SLP node.  Lanes that failed to match are recorded
1631    in MATCHES so the caller can tell at which position in the group a
1632    mismatch was found.  */
1633 
1634 static slp_tree
1635 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1636 		       vec<stmt_vec_info> stmts, unsigned int group_size,
1637 		       poly_uint64 *max_nunits,
1638 		       bool *matches, unsigned *limit, unsigned *tree_size,
1639 		       scalar_stmts_to_slp_tree_map_t *bst_map)
1640 {
1641   unsigned nops, i, this_tree_size = 0;
1642   poly_uint64 this_max_nunits = *max_nunits;
1643 
1644   matches[0] = false;
1645 
1646   stmt_vec_info stmt_info = stmts[0];
1647   if (!is_a<gcall *> (stmt_info->stmt)
1648       && !is_a<gassign *> (stmt_info->stmt)
1649       && !is_a<gphi *> (stmt_info->stmt))
1650     return NULL;
1651 
1652   nops = gimple_num_args (stmt_info->stmt);
1653   if (const int *map = vect_get_operand_map (stmt_info->stmt))
1654     nops = map[0];
1655 
1656   /* If the SLP node is a PHI (induction or reduction), terminate
1657      the recursion.  */
1658   bool *skip_args = XALLOCAVEC (bool, nops);
1659   memset (skip_args, 0, sizeof (bool) * nops);
1660   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1661     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1662       {
1663 	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1664 	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1665 						    group_size);
1666 	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1667 				     max_nunits))
1668 	  return NULL;
1669 
1670 	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1671 	if (def_type == vect_induction_def)
1672 	  {
1673 	    /* Induction PHIs are not cycles but walk the initial
1674 	       value.  Only for inner loops though, for outer loops
1675 	       we need to pick up the value from the actual PHIs
1676 	       to more easily support peeling and epilogue vectorization.  */
1677 	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 	    if (!nested_in_vect_loop_p (loop, stmt_info))
1679 	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1680 	    else
1681 	      loop = loop->inner;
1682 	    skip_args[loop_latch_edge (loop)->dest_idx] = true;
1683 	  }
1684 	else if (def_type == vect_reduction_def
1685 		 || def_type == vect_double_reduction_def
1686 		 || def_type == vect_nested_cycle)
1687 	  {
1688 	    /* Else def types have to match.  */
1689 	    stmt_vec_info other_info;
1690 	    bool all_same = true;
1691 	    FOR_EACH_VEC_ELT (stmts, i, other_info)
1692 	      {
1693 		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1694 		  return NULL;
1695 		if (other_info != stmt_info)
1696 		  all_same = false;
1697 	      }
1698 	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699 	    /* Reduction initial values are not explicitly represented.  */
1700 	    if (!nested_in_vect_loop_p (loop, stmt_info))
1701 	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1702 	    /* Reduction chain backedge defs are filled manually.
1703 	       ???  Need a better way to identify an SLP reduction chain PHI.
1704 	       Or a better overall way to SLP match those.  */
1705 	    if (all_same && def_type == vect_reduction_def)
1706 	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
1707 	  }
1708 	else if (def_type != vect_internal_def)
1709 	  return NULL;
1710       }
1711 
1712 
1713   bool two_operators = false;
1714   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715   tree vectype = NULL_TREE;
1716   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717 			      &this_max_nunits, matches, &two_operators,
1718 			      &vectype))
1719     return NULL;
1720 
1721   /* If the SLP node is a load, terminate the recursion unless masked.  */
1722   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1724     {
1725       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1726 	gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1727       else
1728 	{
1729 	  *max_nunits = this_max_nunits;
1730 	  (*tree_size)++;
1731 	  node = vect_create_new_slp_node (node, stmts, 0);
1732 	  SLP_TREE_VECTYPE (node) = vectype;
1733 	  /* And compute the load permutation.  Whether it is actually
1734 	     a permutation depends on the unrolling factor which is
1735 	     decided later.  */
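	  /* For example, if the group's first element is the load of a[0]
	     and this node's scalar stmts load a[1] and a[0], the recorded
	     load permutation is { 1, 0 }.  */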
1736 	  vec<unsigned> load_permutation;
1737 	  int j;
1738 	  stmt_vec_info load_info;
1739 	  load_permutation.create (group_size);
1740 	  stmt_vec_info first_stmt_info
1741 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1742 	  bool any_permute = false;
1743 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1744 	    {
1745 	      int load_place = vect_get_place_in_interleaving_chain
1746 		  (load_info, first_stmt_info);
1747 	      gcc_assert (load_place != -1);
1748 	      any_permute |= load_place != j;
1749 	      load_permutation.quick_push (load_place);
1750 	    }
1751 
1752 	  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1753 	    {
1754 	      gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1755 			  || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1756 			  || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1757 	      load_permutation.release ();
1758 	      /* We cannot handle permuted masked loads, see PR114375.  */
1759 	      if (any_permute
1760 		  || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1761 		      && DR_GROUP_SIZE (first_stmt_info) != group_size)
1762 		  || STMT_VINFO_STRIDED_P (stmt_info))
1763 		{
1764 		  matches[0] = false;
1765 		  return NULL;
1766 		}
1767 	    }
1768 	  else
1769 	    {
1770 	      SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1771 	      return node;
1772 	    }
1773 	}
1774     }
1775   else if (gimple_assign_single_p (stmt_info->stmt)
1776 	   && !gimple_vuse (stmt_info->stmt)
1777 	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1778     {
1779       /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1780	 the same SSA name vector of a type compatible with VECTYPE.  */
1781       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1782       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1783       stmt_vec_info estmt_info;
1784       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1785 	{
1786 	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1787 	  tree bfref = gimple_assign_rhs1 (estmt);
1788 	  HOST_WIDE_INT lane;
1789 	  if (!known_eq (bit_field_size (bfref),
1790 			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1791 	      || !constant_multiple_p (bit_field_offset (bfref),
1792 				       bit_field_size (bfref), &lane))
1793 	    {
1794 	      lperm.release ();
1795 	      matches[0] = false;
1796 	      return NULL;
1797 	    }
1798 	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
1799 	}
1800       slp_tree vnode = vect_create_new_slp_node (vNULL);
1801       /* ???  We record vectype here but we hide eventually necessary
1802 	 punning and instead rely on code generation to materialize
1803 	 VIEW_CONVERT_EXPRs as necessary.  We instead should make
1804 	 this explicit somehow.  */
1805       SLP_TREE_VECTYPE (vnode) = vectype;
1806       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1807       /* We are always building a permutation node even if it is an identity
1808 	 permute to shield the rest of the vectorizer from the odd node
1809 	 representing an actual vector without any scalar ops.
1810 	 ???  We could hide it completely by making the permute node
1811 	 external?  */
1812       node = vect_create_new_slp_node (node, stmts, 1);
1813       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1814       SLP_TREE_LANE_PERMUTATION (node) = lperm;
1815       SLP_TREE_VECTYPE (node) = vectype;
1816       SLP_TREE_CHILDREN (node).quick_push (vnode);
1817       return node;
1818     }
1819   /* When discovery reaches an associatable operation, see whether we can
1820      improve that to match up lanes in a way superior to the operand
1821      swapping code, which at most looks at two defs.
1822      ???  For BB vectorization we cannot do the brute-force search
1823      for matching as we can succeed by means of builds from scalars
1824      and have no good way to "cost" one build against another.  */
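  /* For example, the two lanes a + b - c and d - e + f are linearized to
     the chains { +a, +b, -c } and { +d, -e, +f }; matching then proceeds
     position by position, re-associating within a lane where needed, and
     positions whose codes differ across lanes are combined with
     vect_slp_build_two_operator_nodes.  */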
1825   else if (is_a <loop_vec_info> (vinfo)
1826 	   /* ???  We don't handle !vect_internal_def defs below.  */
1827 	   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1828 	   && is_gimple_assign (stmt_info->stmt)
1829 	   && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1830 	       || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1831 	   && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1832 	       || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1833 		   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1834     {
1835       /* See if we have a chain of (mixed) adds or subtracts or other
1836 	 associatable ops.  */
1837       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1838       if (code == MINUS_EXPR)
1839 	code = PLUS_EXPR;
1840       stmt_vec_info other_op_stmt_info = NULL;
1841       stmt_vec_info op_stmt_info = NULL;
1842       unsigned chain_len = 0;
1843       auto_vec<chain_op_t> chain;
1844       auto_vec<std::pair<tree_code, gimple *> > worklist;
1845       auto_vec<vec<chain_op_t> > chains (group_size);
1846       auto_vec<slp_tree, 4> children;
1847       bool hard_fail = true;
1848       for (unsigned lane = 0; lane < group_size; ++lane)
1849 	{
1850 	  /* For each lane linearize the addition/subtraction (or other
1851 	     uniform associatable operation) expression tree.  */
1852 	  gimple *op_stmt = NULL, *other_op_stmt = NULL;
1853 	  vect_slp_linearize_chain (vinfo, worklist, chain, code,
1854 				    stmts[lane]->stmt, op_stmt, other_op_stmt,
1855 				    NULL);
1856 	  if (!op_stmt_info && op_stmt)
1857 	    op_stmt_info = vinfo->lookup_stmt (op_stmt);
1858 	  if (!other_op_stmt_info && other_op_stmt)
1859 	    other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1860 	  if (chain.length () == 2)
1861 	    {
1862 	      /* In a chain of just two elements resort to the regular
1863 		 operand swapping scheme.  If we run into a length
1864 		 mismatch still hard-FAIL.  */
1865 	      if (chain_len == 0)
1866 		hard_fail = false;
1867 	      else
1868 		{
1869 		  matches[lane] = false;
1870 		  /* ???  We might want to process the other lanes, but
1871 		     make sure to not give false matching hints to the
1872 		     caller for lanes we did not process.  */
1873 		  if (lane != group_size - 1)
1874 		    matches[0] = false;
1875 		}
1876 	      break;
1877 	    }
1878 	  else if (chain_len == 0)
1879 	    chain_len = chain.length ();
1880 	  else if (chain.length () != chain_len)
1881 	    {
1882 	      /* ???  Here we could slip in magic to compensate with
1883 		 neutral operands.  */
1884 	      matches[lane] = false;
1885 	      if (lane != group_size - 1)
1886 		matches[0] = false;
1887 	      break;
1888 	    }
1889 	  chains.quick_push (chain.copy ());
1890 	  chain.truncate (0);
1891 	}
1892       if (chains.length () == group_size)
1893 	{
1894 	  /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
1895 	  if (!op_stmt_info)
1896 	    {
1897 	      hard_fail = false;
1898 	      goto out;
1899 	    }
1900 	  /* Now we have a set of chains with the same length.  */
1901 	  /* 1. pre-sort according to def_type and operation.  */
1902 	  for (unsigned lane = 0; lane < group_size; ++lane)
1903 	    chains[lane].stablesort (dt_sort_cmp, vinfo);
1904 	  if (dump_enabled_p ())
1905 	    {
1906 	      dump_printf_loc (MSG_NOTE, vect_location,
1907 			       "pre-sorted chains of %s\n",
1908 			       get_tree_code_name (code));
1909 	      for (unsigned lane = 0; lane < group_size; ++lane)
1910 		{
1911 		  for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1912 		    dump_printf (MSG_NOTE, "%s %T ",
1913 				 get_tree_code_name (chains[lane][opnum].code),
1914 				 chains[lane][opnum].op);
1915 		  dump_printf (MSG_NOTE, "\n");
1916 		}
1917 	    }
1918 	  /* 2. try to build children nodes, associating as necessary.  */
1919 	  for (unsigned n = 0; n < chain_len; ++n)
1920 	    {
1921 	      vect_def_type dt = chains[0][n].dt;
1922 	      unsigned lane;
1923 	      for (lane = 0; lane < group_size; ++lane)
1924 		if (chains[lane][n].dt != dt)
1925 		  {
1926 		    if (dt == vect_constant_def
1927 			&& chains[lane][n].dt == vect_external_def)
1928 		      dt = vect_external_def;
1929 		    else if (dt == vect_external_def
1930 			     && chains[lane][n].dt == vect_constant_def)
1931 		      ;
1932 		    else
1933 		      break;
1934 		  }
1935 	      if (lane != group_size)
1936 		{
1937 		  if (dump_enabled_p ())
1938 		    dump_printf_loc (MSG_NOTE, vect_location,
1939 				     "giving up on chain due to mismatched "
1940 				     "def types\n");
1941 		  matches[lane] = false;
1942 		  if (lane != group_size - 1)
1943 		    matches[0] = false;
1944 		  goto out;
1945 		}
1946 	      if (dt == vect_constant_def
1947 		  || dt == vect_external_def)
1948 		{
1949 		  /* Check whether we can build the invariant.  If we can't
1950 		     we never will be able to.  */
1951 		  tree type = TREE_TYPE (chains[0][n].op);
1952 		  if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1953 		      && (TREE_CODE (type) == BOOLEAN_TYPE
1954 			  || !can_duplicate_and_interleave_p (vinfo, group_size,
1955 							      type)))
1956 		    {
1957 		      matches[0] = false;
1958 		      goto out;
1959 		    }
1960 		  vec<tree> ops;
1961 		  ops.create (group_size);
1962 		  for (lane = 0; lane < group_size; ++lane)
1963 		    ops.quick_push (chains[lane][n].op);
1964 		  slp_tree child = vect_create_new_slp_node (ops);
1965 		  SLP_TREE_DEF_TYPE (child) = dt;
1966 		  children.safe_push (child);
1967 		}
1968 	      else if (dt != vect_internal_def)
1969 		{
1970 		  /* Not sure, we might need sth special.
1971 		     gcc.dg/vect/pr96854.c,
1972 		     gfortran.dg/vect/fast-math-pr37021.f90
1973 		     and gfortran.dg/vect/pr61171.f trigger.  */
1974 		  /* Soft-fail for now.  */
1975 		  hard_fail = false;
1976 		  goto out;
1977 		}
1978 	      else
1979 		{
1980 		  vec<stmt_vec_info> op_stmts;
1981 		  op_stmts.create (group_size);
1982 		  slp_tree child = NULL;
1983 		  /* Brute-force our way.  We have to consider a lane
1984 		     failing after fixing an earlier fail up in the
1985 		     SLP discovery recursion.  So track the current
1986 		     permute per lane.  */
1987 		  unsigned *perms = XALLOCAVEC (unsigned, group_size);
1988 		  memset (perms, 0, sizeof (unsigned) * group_size);
1989 		  do
1990 		    {
1991 		      op_stmts.truncate (0);
1992 		      for (lane = 0; lane < group_size; ++lane)
1993 			op_stmts.quick_push
1994 			  (vinfo->lookup_def (chains[lane][n].op));
1995 		      child = vect_build_slp_tree (vinfo, op_stmts,
1996 						   group_size, &this_max_nunits,
1997 						   matches, limit,
1998 						   &this_tree_size, bst_map);
1999 		      /* ???  We're likely getting too many fatal mismatches
2000 			 here so maybe we want to ignore them (but then we
2001 			 have no idea which lanes fatally mismatched).  */
2002 		      if (child || !matches[0])
2003 			break;
2004 		      /* Swap another lane we have not yet matched up into
2005 			 lanes that did not match.  If we run out of
2006 			 permute possibilities for a lane terminate the
2007 			 search.  */
2008 		      bool term = false;
2009 		      for (lane = 1; lane < group_size; ++lane)
2010 			if (!matches[lane])
2011 			  {
2012 			    if (n + perms[lane] + 1 == chain_len)
2013 			      {
2014 				term = true;
2015 				break;
2016 			      }
2017 			    std::swap (chains[lane][n],
2018 				       chains[lane][n + perms[lane] + 1]);
2019 			    perms[lane]++;
2020 			  }
2021 		      if (term)
2022 			break;
2023 		    }
2024 		  while (1);
2025 		  if (!child)
2026 		    {
2027 		      if (dump_enabled_p ())
2028 			dump_printf_loc (MSG_NOTE, vect_location,
2029 					 "failed to match up op %d\n", n);
2030 		      op_stmts.release ();
2031 		      if (lane != group_size - 1)
2032 			matches[0] = false;
2033 		      else
2034 			matches[lane] = false;
2035 		      goto out;
2036 		    }
2037 		  if (dump_enabled_p ())
2038 		    {
2039 		      dump_printf_loc (MSG_NOTE, vect_location,
2040 				       "matched up op %d to\n", n);
2041 		      vect_print_slp_tree (MSG_NOTE, vect_location, child);
2042 		    }
2043 		  children.safe_push (child);
2044 		}
2045 	    }
2046 	  /* 3. build SLP nodes to combine the chain.  */
2047 	  for (unsigned lane = 0; lane < group_size; ++lane)
2048 	    if (chains[lane][0].code != code)
2049 	      {
2050 		/* See if there's any alternate all-PLUS entry.  */
2051 		unsigned n;
2052 		for (n = 1; n < chain_len; ++n)
2053 		  {
2054 		    for (lane = 0; lane < group_size; ++lane)
2055 		      if (chains[lane][n].code != code)
2056 			break;
2057 		    if (lane == group_size)
2058 		      break;
2059 		  }
2060 		if (n != chain_len)
2061 		  {
2062 		    /* Swap that in at first position.  */
2063 		    std::swap (children[0], children[n]);
2064 		    for (lane = 0; lane < group_size; ++lane)
2065 		      std::swap (chains[lane][0], chains[lane][n]);
2066 		  }
2067 		else
2068 		  {
2069 		    /* ???  When this triggers and we end up with two
2070 		       vect_constant/external_def up-front things break (ICE)
2071 		       spectacularly finding an insertion place for the
2072 		       all-constant op.  We should have a fully
2073 		       vect_internal_def operand though(?) so we can swap
2074 		       that into first place and then prepend the all-zero
2075 		       constant.  */
2076 		    if (dump_enabled_p ())
2077 		      dump_printf_loc (MSG_NOTE, vect_location,
2078 				       "inserting constant zero to compensate "
2079 				       "for (partially) negated first "
2080 				       "operand\n");
2081 		    chain_len++;
2082 		    for (lane = 0; lane < group_size; ++lane)
2083 		      chains[lane].safe_insert
2084 			(0, chain_op_t (code, vect_constant_def, NULL_TREE));
2085 		    vec<tree> zero_ops;
2086 		    zero_ops.create (group_size);
2087 		    zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2088 		    for (lane = 1; lane < group_size; ++lane)
2089 		      zero_ops.quick_push (zero_ops[0]);
2090 		    slp_tree zero = vect_create_new_slp_node (zero_ops);
2091 		    SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2092 		    children.safe_insert (0, zero);
2093 		  }
2094 		break;
2095 	      }
2096 	  for (unsigned i = 1; i < children.length (); ++i)
2097 	    {
2098 	      slp_tree op0 = children[i - 1];
2099 	      slp_tree op1 = children[i];
2100 	      bool this_two_op = false;
2101 	      for (unsigned lane = 0; lane < group_size; ++lane)
2102 		if (chains[lane][i].code != chains[0][i].code)
2103 		  {
2104 		    this_two_op = true;
2105 		    break;
2106 		  }
2107 	      slp_tree child;
2108 	      if (i == children.length () - 1)
2109 		child = vect_create_new_slp_node (node, stmts, 2);
2110 	      else
2111 		child = vect_create_new_slp_node (2, ERROR_MARK);
2112 	      if (this_two_op)
2113 		{
2114 		  vec<std::pair<unsigned, unsigned> > lperm;
2115 		  lperm.create (group_size);
2116 		  for (unsigned lane = 0; lane < group_size; ++lane)
2117 		    lperm.quick_push (std::make_pair
2118 		      (chains[lane][i].code != chains[0][i].code, lane));
2119 		  vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2120 						     (chains[0][i].code == code
2121 						      ? op_stmt_info
2122 						      : other_op_stmt_info),
2123 						     (chains[0][i].code == code
2124 						      ? other_op_stmt_info
2125 						      : op_stmt_info),
2126 						     lperm);
2127 		}
2128 	      else
2129 		{
2130 		  SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2131 		  SLP_TREE_VECTYPE (child) = vectype;
2132 		  SLP_TREE_LANES (child) = group_size;
2133 		  SLP_TREE_CHILDREN (child).quick_push (op0);
2134 		  SLP_TREE_CHILDREN (child).quick_push (op1);
2135 		  SLP_TREE_REPRESENTATIVE (child)
2136 		    = (chains[0][i].code == code
2137 		       ? op_stmt_info : other_op_stmt_info);
2138 		}
2139 	      children[i] = child;
2140 	    }
2141 	  *tree_size += this_tree_size + 1;
2142 	  *max_nunits = this_max_nunits;
2143 	  while (!chains.is_empty ())
2144 	    chains.pop ().release ();
2145 	  return node;
2146 	}
2147 out:
2148       while (!children.is_empty ())
2149 	vect_free_slp_tree (children.pop ());
2150       while (!chains.is_empty ())
2151 	chains.pop ().release ();
2152       /* Hard-fail, otherwise we might run into quadratic processing of the
2153 	 chains starting one stmt into the chain again.  */
2154       if (hard_fail)
2155 	return NULL;
2156       /* Fall thru to normal processing.  */
2157     }
2158 
2159   /* Get at the operands, verifying they are compatible.  */
2160   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2161   slp_oprnd_info oprnd_info;
2162   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2163     {
2164       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2165 					     stmts, i, &oprnds_info);
2166       if (res != 0)
2167 	matches[(res == -1) ? 0 : i] = false;
2168       if (!matches[0])
2169 	break;
2170     }
2171   for (i = 0; i < group_size; ++i)
2172     if (!matches[i])
2173       {
2174 	vect_free_oprnd_info (oprnds_info);
2175 	return NULL;
2176       }
2177   swap = NULL;
2178 
2179   auto_vec<slp_tree, 4> children;
2180 
2181   stmt_info = stmts[0];
2182 
2183   /* Create SLP_TREE nodes for the definition node/s.  */
2184   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2185     {
2186       slp_tree child;
2187       unsigned int j;
2188 
2189       /* We're skipping certain operands from processing, for example
2190 	 outer loop reduction initial defs.  */
2191       if (skip_args[i])
2192 	{
2193 	  children.safe_push (NULL);
2194 	  continue;
2195 	}
2196 
2197       if (oprnd_info->first_dt == vect_uninitialized_def)
2198 	{
2199 	  /* COND_EXPRs can have one operand too many if the condition
2200 	     is an SSA name.  */
2201 	  gcc_assert (i == 3 && nops == 4);
2202 	  continue;
2203 	}
2204 
2205       if (is_a <bb_vec_info> (vinfo)
2206 	  && oprnd_info->first_dt == vect_internal_def
2207 	  && !oprnd_info->any_pattern)
2208 	{
2209 	  /* For BB vectorization, if all defs are the same do not
2210 	     bother to continue the build along the single-lane
2211 	     graph but use a splat of the scalar value.  */
2212 	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
2213 	  for (j = 1; j < group_size; ++j)
2214 	    if (oprnd_info->def_stmts[j] != first_def)
2215 	      break;
2216 	  if (j == group_size
2217 	      /* But avoid doing this for loads where we may be
2218 		 able to CSE things, unless the stmt is not
2219 		 vectorizable.  */
2220 	      && (!STMT_VINFO_VECTORIZABLE (first_def)
2221 		  || !gimple_vuse (first_def->stmt)))
2222 	    {
2223 	      if (dump_enabled_p ())
2224 		dump_printf_loc (MSG_NOTE, vect_location,
2225 				 "Using a splat of the uniform operand %G",
2226 				 first_def->stmt);
2227 	      oprnd_info->first_dt = vect_external_def;
2228 	    }
2229 	}
2230 
2231       if (oprnd_info->first_dt == vect_external_def
2232 	  || oprnd_info->first_dt == vect_constant_def)
2233 	{
2234 	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2235 	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2236 	  oprnd_info->ops = vNULL;
2237 	  children.safe_push (invnode);
2238 	  continue;
2239 	}
2240 
2241       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2242 					group_size, &this_max_nunits,
2243 					matches, limit,
2244 					&this_tree_size, bst_map)) != NULL)
2245 	{
2246 	  oprnd_info->def_stmts = vNULL;
2247 	  children.safe_push (child);
2248 	  continue;
2249 	}
2250 
2251       /* If the SLP build for operand zero failed and operand zero
2252 	 and one can be commuted, try that for the scalar stmts
2253 	 that failed the match.  */
2254       if (i == 0
2255 	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
2256 	  && matches[0]
2257 	  /* ???  For COND_EXPRs we can swap the comparison operands
2258 	     as well as the arms under some constraints.  */
2259 	  && nops == 2
2260 	  && oprnds_info[1]->first_dt == vect_internal_def
2261 	  && is_gimple_assign (stmt_info->stmt)
2262 	  /* Swapping operands for reductions breaks assumptions later on.  */
2263 	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2264 	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2265 	{
2266 	  /* See whether we can swap the matching or the non-matching
2267 	     stmt operands.  */
2268 	  bool swap_not_matching = true;
2269 	  do
2270 	    {
2271 	      for (j = 0; j < group_size; ++j)
2272 		{
2273 		  if (matches[j] != !swap_not_matching)
2274 		    continue;
2275 		  stmt_vec_info stmt_info = stmts[j];
2276 		  /* Verify if we can swap operands of this stmt.  */
2277 		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2278 		  if (!stmt
2279 		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2280 		    {
2281 		      if (!swap_not_matching)
2282 			goto fail;
2283 		      swap_not_matching = false;
2284 		      break;
2285 		    }
2286 		}
2287 	    }
2288 	  while (j != group_size);
2289 
2290 	  /* Swap mismatched definition stmts.  */
2291 	  if (dump_enabled_p ())
2292 	    dump_printf_loc (MSG_NOTE, vect_location,
2293 			     "Re-trying with swapped operands of stmts ");
2294 	  for (j = 0; j < group_size; ++j)
2295 	    if (matches[j] == !swap_not_matching)
2296 	      {
2297 		std::swap (oprnds_info[0]->def_stmts[j],
2298 			   oprnds_info[1]->def_stmts[j]);
2299 		std::swap (oprnds_info[0]->ops[j],
2300 			   oprnds_info[1]->ops[j]);
2301 		if (dump_enabled_p ())
2302 		  dump_printf (MSG_NOTE, "%d ", j);
2303 	      }
2304 	  if (dump_enabled_p ())
2305 	    dump_printf (MSG_NOTE, "\n");
2306 	  /* After swapping some operands we lost track of whether an
2307 	     operand has any pattern defs, so be conservative here.  */
2308 	  if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2309 	    oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2310 	  /* And try again with scratch 'matches' ... */
2311 	  bool *tem = XALLOCAVEC (bool, group_size);
2312 	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2313 					    group_size, &this_max_nunits,
2314 					    tem, limit,
2315 					    &this_tree_size, bst_map)) != NULL)
2316 	    {
2317 	      oprnd_info->def_stmts = vNULL;
2318 	      children.safe_push (child);
2319 	      continue;
2320 	    }
2321 	}
2322 fail:
2323 
2324       /* If the SLP build failed and we analyze a basic-block
2325 	 simply treat nodes we fail to build as externally defined
2326 	 (and thus build vectors from the scalar defs).
2327 	 The cost model will reject outright expensive cases.
2328 	 ???  This doesn't treat cases where permutation ultimately
2329 	 fails (or we don't try permutation below).  Ideally we'd
2330 	 even compute a permutation that will end up with the maximum
2331 	 SLP tree size...  */
2332       if (is_a <bb_vec_info> (vinfo)
2333 	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
2334 	     do extra work to cancel the pattern so the uses see the
2335 	     scalar version.  */
2336 	  && !is_pattern_stmt_p (stmt_info)
2337 	  && !oprnd_info->any_pattern)
2338 	{
2339 	  /* But if there's a leading vector sized set of matching stmts
2340 	     fail here so we can split the group.  This matches the condition
2341 	     vect_analyze_slp_instance uses.  */
2342 	  /* ???  We might want to split here and combine the results to support
2343 	     multiple vector sizes better.  */
2344 	  for (j = 0; j < group_size; ++j)
2345 	    if (!matches[j])
2346 	      break;
2347 	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2348 	    {
2349 	      if (dump_enabled_p ())
2350 		dump_printf_loc (MSG_NOTE, vect_location,
2351 				 "Building vector operands from scalars\n");
2352 	      this_tree_size++;
2353 	      child = vect_create_new_slp_node (oprnd_info->ops);
2354 	      children.safe_push (child);
2355 	      oprnd_info->ops = vNULL;
2356 	      continue;
2357 	    }
2358 	}
2359 
2360       gcc_assert (child == NULL);
2361       FOR_EACH_VEC_ELT (children, j, child)
2362 	if (child)
2363 	  vect_free_slp_tree (child);
2364       vect_free_oprnd_info (oprnds_info);
2365       return NULL;
2366     }
2367 
2368   vect_free_oprnd_info (oprnds_info);
2369 
2370   /* If all children of this node are built up from uniform scalars, or
2371      if building them requires more than one possibly expensive vector
2372      construction, throw the node away and cause it to be built up from
2373      scalars instead.  The exception is the SLP node for the vector store.  */
2374   if (is_a <bb_vec_info> (vinfo)
2375       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2376       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2377 	 do extra work to cancel the pattern so the uses see the
2378 	 scalar version.  */
2379       && !is_pattern_stmt_p (stmt_info))
2380     {
2381       slp_tree child;
2382       unsigned j;
2383       bool all_uniform_p = true;
2384       unsigned n_vector_builds = 0;
2385       FOR_EACH_VEC_ELT (children, j, child)
2386 	{
2387 	  if (!child)
2388 	    ;
2389 	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2390 	    all_uniform_p = false;
2391 	  else if (!vect_slp_tree_uniform_p (child))
2392 	    {
2393 	      all_uniform_p = false;
2394 	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2395 		n_vector_builds++;
2396 	    }
2397 	}
2398       if (all_uniform_p
2399 	  || n_vector_builds > 1
2400 	  || (n_vector_builds == children.length ()
2401 	      && is_a <gphi *> (stmt_info->stmt)))
2402 	{
2403 	  /* Roll back.  */
2404 	  matches[0] = false;
2405 	  FOR_EACH_VEC_ELT (children, j, child)
2406 	    if (child)
2407 	      vect_free_slp_tree (child);
2408 
2409 	  if (dump_enabled_p ())
2410 	    dump_printf_loc (MSG_NOTE, vect_location,
2411 			     "Building parent vector operands from "
2412 			     "scalars instead\n");
2413 	  return NULL;
2414 	}
2415     }
2416 
2417   *tree_size += this_tree_size + 1;
2418   *max_nunits = this_max_nunits;
2419 
2420   if (two_operators)
2421     {
2422       /* ???  We'd likely want to either cache in bst_map sth like
2423 	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2424 	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2425 	 explicit stmts to put in so the keying on 'stmts' doesn't
2426 	 work (but we have the same issue with nodes that use 'ops').  */
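      /* For example, for the lanes { a+b, a-b, c+d, c-d } node ONE computes
	 { a+b, a+b, c+d, c+d }, node TWO computes { a-b, a-b, c-d, c-d }
	 and the lane permutation built below selects
	 { 0[0], 1[1], 0[2], 1[3] } to recover the original mixed lanes.  */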
2427       slp_tree one = new _slp_tree;
2428       slp_tree two = new _slp_tree;
2429       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2430       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2431       SLP_TREE_VECTYPE (one) = vectype;
2432       SLP_TREE_VECTYPE (two) = vectype;
2433       SLP_TREE_CHILDREN (one).safe_splice (children);
2434       SLP_TREE_CHILDREN (two).safe_splice (children);
2435       slp_tree child;
2436       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2437 	SLP_TREE_REF_COUNT (child)++;
2438 
2439       /* Here we record the original defs since this
2440 	 node represents the final lane configuration.  */
2441       node = vect_create_new_slp_node (node, stmts, 2);
2442       SLP_TREE_VECTYPE (node) = vectype;
2443       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2444       SLP_TREE_CHILDREN (node).quick_push (one);
2445       SLP_TREE_CHILDREN (node).quick_push (two);
2446       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2447       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2448       enum tree_code ocode = ERROR_MARK;
2449       stmt_vec_info ostmt_info;
2450       unsigned j = 0;
2451       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2452 	{
2453 	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2454 	  if (gimple_assign_rhs_code (ostmt) != code0)
2455 	    {
2456 	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2457 	      ocode = gimple_assign_rhs_code (ostmt);
2458 	      j = i;
2459 	    }
2460 	  else
2461 	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2462 	}
2463       SLP_TREE_CODE (one) = code0;
2464       SLP_TREE_CODE (two) = ocode;
2465       SLP_TREE_LANES (one) = stmts.length ();
2466       SLP_TREE_LANES (two) = stmts.length ();
2467       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2468       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2469       return node;
2470     }
2471 
2472   node = vect_create_new_slp_node (node, stmts, nops);
2473   SLP_TREE_VECTYPE (node) = vectype;
2474   SLP_TREE_CHILDREN (node).splice (children);
2475   return node;
2476 }
2477 
2478 /* Dump a single SLP tree NODE.  */
2479 
2480 static void
2481 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2482 		     slp_tree node)
2483 {
2484   unsigned i, j;
2485   slp_tree child;
2486   stmt_vec_info stmt_info;
2487   tree op;
2488 
2489   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2490   dump_user_location_t user_loc = loc.get_user_location ();
2491   dump_printf_loc (metadata, user_loc,
2492 		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2493 		   ", refcnt=%u)",
2494 		   SLP_TREE_DEF_TYPE (node) == vect_external_def
2495 		   ? " (external)"
2496 		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2497 		      ? " (constant)"
2498 		      : ""), node,
2499 		   estimated_poly_value (node->max_nunits),
2500 					 SLP_TREE_REF_COUNT (node));
2501   if (SLP_TREE_VECTYPE (node))
2502     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2503   dump_printf (metadata, "\n");
2504   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2505     {
2506       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2507 	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2508       else
2509 	dump_printf_loc (metadata, user_loc, "op template: %G",
2510 			 SLP_TREE_REPRESENTATIVE (node)->stmt);
2511     }
2512   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2513     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2514       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2515   else
2516     {
2517       dump_printf_loc (metadata, user_loc, "\t{ ");
2518       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2519 	dump_printf (metadata, "%T%s ", op,
2520 		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2521       dump_printf (metadata, "}\n");
2522     }
2523   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2524     {
2525       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2526       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2527 	dump_printf (dump_kind, " %u", j);
2528       dump_printf (dump_kind, " }\n");
2529     }
2530   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2531     {
2532       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2533       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2534 	dump_printf (dump_kind, " %u[%u]",
2535 		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
2536 		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
2537       dump_printf (dump_kind, " }\n");
2538     }
2539   if (SLP_TREE_CHILDREN (node).is_empty ())
2540     return;
2541   dump_printf_loc (metadata, user_loc, "\tchildren");
2542   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2543     dump_printf (dump_kind, " %p", (void *)child);
2544   dump_printf (dump_kind, "\n");
2545 }
2546 
2547 DEBUG_FUNCTION void
2548 debug (slp_tree node)
2549 {
2550   debug_dump_context ctx;
2551   vect_print_slp_tree (MSG_NOTE,
2552 		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
2553 		       node);
2554 }
2555 
2556 /* Recursive helper for the dot producer below.  */
2557 
2558 static void
2559 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2560 {
2561   if (visited.add (node))
2562     return;
2563 
2564   fprintf (f, "\"%p\" [label=\"", (void *)node);
2565   vect_print_slp_tree (MSG_NOTE,
2566 		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
2567 		       node);
2568   fprintf (f, "\"];\n");
2569 
2570 
2571   for (slp_tree child : SLP_TREE_CHILDREN (node))
2572     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2573 
2574   for (slp_tree child : SLP_TREE_CHILDREN (node))
2575     if (child)
2576       dot_slp_tree (f, child, visited);
2577 }
2578 
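/* Dump the SLP graph rooted at NODE in the dot (graphviz) format to the
   file FNAME, e.g. for inspection from a debugger session.  */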
2579 DEBUG_FUNCTION void
2580 dot_slp_tree (const char *fname, slp_tree node)
2581 {
2582   FILE *f = fopen (fname, "w");
2583   fprintf (f, "digraph {\n");
2584   fflush (f);
2585     {
2586       debug_dump_context ctx (f);
2587       hash_set<slp_tree> visited;
2588       dot_slp_tree (f, node, visited);
2589     }
2590   fflush (f);
2591   fprintf (f, "}\n");
2592   fclose (f);
2593 }
2594 
2595 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2596 
2597 static void
2598 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2599 		      slp_tree node, hash_set<slp_tree> &visited)
2600 {
2601   unsigned i;
2602   slp_tree child;
2603 
2604   if (visited.add (node))
2605     return;
2606 
2607   vect_print_slp_tree (dump_kind, loc, node);
2608 
2609   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2610     if (child)
2611       vect_print_slp_graph (dump_kind, loc, child, visited);
2612 }
2613 
2614 static void
2615 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2616 		      slp_tree entry)
2617 {
2618   hash_set<slp_tree> visited;
2619   vect_print_slp_graph (dump_kind, loc, entry, visited);
2620 }
2621 
2622 /* Mark the tree rooted at NODE with PURE_SLP.  */
2623 
2624 static void
2625 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2626 {
2627   int i;
2628   stmt_vec_info stmt_info;
2629   slp_tree child;
2630 
2631   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2632     return;
2633 
2634   if (visited.add (node))
2635     return;
2636 
2637   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2638     STMT_SLP_TYPE (stmt_info) = pure_slp;
2639 
2640   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2641     if (child)
2642       vect_mark_slp_stmts (child, visited);
2643 }
2644 
2645 static void
2646 vect_mark_slp_stmts (slp_tree node)
2647 {
2648   hash_set<slp_tree> visited;
2649   vect_mark_slp_stmts (node, visited);
2650 }
2651 
2652 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2653 
2654 static void
2655 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2656 {
2657   int i;
2658   stmt_vec_info stmt_info;
2659   slp_tree child;
2660 
2661   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2662     return;
2663 
2664   if (visited.add (node))
2665     return;
2666 
2667   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2668     {
2669       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2670                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2671       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2672     }
2673 
2674   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2675     if (child)
2676       vect_mark_slp_stmts_relevant (child, visited);
2677 }
2678 
2679 static void
2680 vect_mark_slp_stmts_relevant (slp_tree node)
2681 {
2682   hash_set<slp_tree> visited;
2683   vect_mark_slp_stmts_relevant (node, visited);
2684 }
2685 
2686 
2687 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array.  */
2688 
2689 static void
2690 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2691 		       hash_set<slp_tree> &visited)
2692 {
2693   if (!node || visited.add (node))
2694     return;
2695 
2696   if (SLP_TREE_CHILDREN (node).length () == 0)
2697     {
2698       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2699 	return;
2700       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2701       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2702 	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2703 	loads.safe_push (node);
2704     }
2705   else
2706     {
2707       unsigned i;
2708       slp_tree child;
2709       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2710 	vect_gather_slp_loads (loads, child, visited);
2711     }
2712 }
2713 
2714 
2715 /* Find the last scalar stmt in NODE.  */
2716 
2717 stmt_vec_info
2718 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2719 {
2720   stmt_vec_info last = NULL;
2721   stmt_vec_info stmt_vinfo;
2722 
2723   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2724     {
2725       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2726       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2727     }
2728 
2729   return last;
2730 }
2731 
2732 /* Find the first stmt in NODE.  */
2733 
2734 stmt_vec_info
2735 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2736 {
2737   stmt_vec_info first = NULL;
2738   stmt_vec_info stmt_vinfo;
2739 
2740   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2741     {
2742       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2743       if (!first
2744 	  || get_later_stmt (stmt_vinfo, first) == first)
2745 	first = stmt_vinfo;
2746     }
2747 
2748   return first;
2749 }
2750 
2751 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2752    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2753    (also containing the first GROUP1_SIZE stmts, since stores are
2754    consecutive), the second containing the remainder.
2755    Return the first stmt in the second group.  */
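/* For example, splitting a group of six consecutive stores A0..A5 with
   GROUP1_SIZE == 2 yields the groups { A0, A1 } and { A2, A3, A4, A5 };
   DR_GROUP_GAP of the second group is increased by 2 and that of the
   first by 4 so each group still steps over the whole original group
   per scalar iteration.  */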
2756 
2757 static stmt_vec_info
2758 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2759 {
2760   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2761   gcc_assert (group1_size > 0);
2762   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2763   gcc_assert (group2_size > 0);
2764   DR_GROUP_SIZE (first_vinfo) = group1_size;
2765 
2766   stmt_vec_info stmt_info = first_vinfo;
2767   for (unsigned i = group1_size; i > 1; i--)
2768     {
2769       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2770       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2771     }
2772   /* STMT is now the last element of the first group.  */
2773   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2774   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2775 
2776   DR_GROUP_SIZE (group2) = group2_size;
2777   for (stmt_info = group2; stmt_info;
2778        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2779     {
2780       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2781       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2782     }
2783 
2784   /* For the second group, the DR_GROUP_GAP is that before the original group,
2785      plus skipping over the first group.  */
2786   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2787 
2788   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
2789   DR_GROUP_GAP (first_vinfo) += group2_size;
2790 
2791   if (dump_enabled_p ())
2792     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2793 		     group1_size, group2_size);
2794 
2795   return group2;
2796 }
2797 
2798 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2799    statements and a vector of NUNITS elements.  */
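/* For example, with a group of 6 scalar stmts and 4-element vectors the
   least common multiple of 4 and 6 is 12, so the unrolling factor is
   12 / 6 = 2: two copies of the group exactly fill three vectors.  */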
2800 
2801 static poly_uint64
2802 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2803 {
2804   return exact_div (common_multiple (nunits, group_size), group_size);
2805 }
2806 
2807 /* Helper that checks to see if a node is a load node.  */
2808 
2809 static inline bool
2810 vect_is_slp_load_node  (slp_tree root)
2811 {
2812   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2813 	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2814 	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2815 }
2816 
2817 
2818 /* Helper function of optimize_load_redistribution that performs the operation
2819    recursively.  */
2820 
2821 static slp_tree
2822 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2823 				vec_info *vinfo, unsigned int group_size,
2824 				hash_map<slp_tree, slp_tree> *load_map,
2825 				slp_tree root)
2826 {
2827   if (slp_tree *leader = load_map->get (root))
2828     return *leader;
2829 
2830   slp_tree node;
2831   unsigned i;
2832 
2833   /* For now, we don't know anything about externals so do not do anything.  */
2834   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2835     return NULL;
2836   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2837     {
2838       /* First convert this node into a load node and add it to the leaves
2839 	 list and flatten the permute from a lane to a load one.  If it's
2840 	 unneeded it will be elided later.  */
2841       vec<stmt_vec_info> stmts;
2842       stmts.create (SLP_TREE_LANES (root));
2843       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2844       for (unsigned j = 0; j < lane_perm.length (); j++)
2845 	{
2846 	  std::pair<unsigned, unsigned> perm = lane_perm[j];
2847 	  node = SLP_TREE_CHILDREN (root)[perm.first];
2848 
2849 	  if (!vect_is_slp_load_node (node)
2850 	      || SLP_TREE_CHILDREN (node).exists ())
2851 	    {
2852 	      stmts.release ();
2853 	      goto next;
2854 	    }
2855 
2856 	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2857 	}
2858 
2859       if (dump_enabled_p ())
2860 	dump_printf_loc (MSG_NOTE, vect_location,
2861 			 "converting stmts on permute node %p\n", root);
2862 
2863       bool *matches = XALLOCAVEC (bool, group_size);
2864       poly_uint64 max_nunits = 1;
2865       unsigned tree_size = 0, limit = 1;
2866       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2867 				  matches, &limit, &tree_size, bst_map);
2868       if (!node)
2869 	stmts.release ();
2870 
2871       load_map->put (root, node);
2872       return node;
2873     }
2874 
2875 next:
2876   load_map->put (root, NULL);
2877 
2878   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2879     {
2880       slp_tree value
2881 	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2882 					  node);
2883       if (value)
2884 	{
2885 	  SLP_TREE_REF_COUNT (value)++;
2886 	  SLP_TREE_CHILDREN (root)[i] = value;
2887 	  /* ???  We know the original leaves of the replaced nodes will
2888 	     be referenced by bst_map, only the permutes created by
2889 	     pattern matching are not.  */
2890 	  if (SLP_TREE_REF_COUNT (node) == 1)
2891 	    load_map->remove (node);
2892 	  vect_free_slp_tree (node);
2893 	}
2894     }
2895 
2896   return NULL;
2897 }
2898 
2899 /* Temporary workaround for loads not being CSEd during SLP build.  This
2900    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2901    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2902    same DR such that the final operation is equal to a permuted load.  Such
2903    NODES are then directly converted into LOADS themselves.  The nodes are
2904    CSEd using BST_MAP.  */
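/* For example, a VEC_PERM node selecting lane 0 of the load { a[0], a[1] }
   and lane 1 of the load { a[2], a[3] } computes { a[0], a[3] }; when both
   loads read from the same DR the node can be rebuilt directly as a
   (permuted) load of those two elements.  */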
2905 
2906 static void
2907 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2908 			      vec_info *vinfo, unsigned int group_size,
2909 			      hash_map<slp_tree, slp_tree> *load_map,
2910 			      slp_tree root)
2911 {
2912   slp_tree node;
2913   unsigned i;
2914 
2915   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2916     {
2917       slp_tree value
2918 	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2919 					  node);
2920       if (value)
2921 	{
2922 	  SLP_TREE_REF_COUNT (value)++;
2923 	  SLP_TREE_CHILDREN (root)[i] = value;
2924 	  /* ???  We know the original leaves of the replaced nodes will
2925 	     be referenced by bst_map, only the permutes created by
2926 	     pattern matching are not.  */
2927 	  if (SLP_TREE_REF_COUNT (node) == 1)
2928 	    load_map->remove (node);
2929 	  vect_free_slp_tree (node);
2930 	}
2931     }
2932 }
2933 
2934 /* Helper function of vect_match_slp_patterns.
2935 
2936    Attempts to match patterns against the slp tree rooted in REF_NODE using
2937    VINFO.  Patterns are matched in post-order traversal.
2938 
2939    If matching is successful the value in REF_NODE is updated and returned, if
2940    not then it is returned unchanged.  */
2941 
2942 static bool
2943 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2944 			   slp_tree_to_load_perm_map_t *perm_cache,
2945 			   slp_compat_nodes_map_t *compat_cache,
2946 			   hash_set<slp_tree> *visited)
2947 {
2948   unsigned i;
2949   slp_tree node = *ref_node;
2950   bool found_p = false;
2951   if (!node || visited->add (node))
2952     return false;
2953 
2954   slp_tree child;
2955   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2956     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2957 					  vinfo, perm_cache, compat_cache,
2958 					  visited);
2959 
2960   for (unsigned x = 0; x < num__slp_patterns; x++)
2961     {
2962       vect_pattern *pattern
2963 	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
2964       if (pattern)
2965 	{
2966 	  pattern->build (vinfo);
2967 	  delete pattern;
2968 	  found_p = true;
2969 	}
2970     }
2971 
2972   return found_p;
2973 }
2974 
2975 /* Applies pattern matching to the SLP tree of the given INSTANCE using
2976    vec_info VINFO.
2977 
2978    The tree is modified in place and true is returned if any pattern
2979    matched.  Patterns are tried in order and multiple patterns may match.  */
2980 
2981 static bool
2982 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2983 			 hash_set<slp_tree> *visited,
2984 			 slp_tree_to_load_perm_map_t *perm_cache,
2985 			 slp_compat_nodes_map_t *compat_cache)
2986 {
2987   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2988   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2989 
2990   if (dump_enabled_p ())
2991     dump_printf_loc (MSG_NOTE, vect_location,
2992 		     "Analyzing SLP tree %p for patterns\n",
2993 		     SLP_INSTANCE_TREE (instance));
2994 
2995   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2996 				    visited);
2997 }
2998 
2999 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3000    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3001    Return true if we could use IFN_STORE_LANES instead and if that appears
3002    to be the better approach.  */
3003 
3004 static bool
3005 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3006 			       unsigned int group_size,
3007 			       unsigned int new_group_size)
3008 {
3009   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3010   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3011   if (!vectype)
3012     return false;
3013   /* Allow the split if one of the two new groups would operate on full
3014      vectors *within* rather than across one scalar loop iteration.
3015      This is purely a heuristic, but it should work well for group
3016      sizes of 3 and 4, where the possible splits are:
3017 
3018        3->2+1:  OK if the vector has exactly two elements
3019        4->2+2:  Likewise
3020        4->3+1:  Less clear-cut.  */
3021   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3022       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3023     return false;
3024   return vect_store_lanes_supported (vectype, group_size, false);
3025 }
3026 
3027 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3028    vect_build_slp_tree to build a tree of packed stmts if possible.
3029    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3030 
3031 static bool
3032 vect_analyze_slp_instance (vec_info *vinfo,
3033 			   scalar_stmts_to_slp_tree_map_t *bst_map,
3034 			   stmt_vec_info stmt_info, slp_instance_kind kind,
3035 			   unsigned max_tree_size, unsigned *limit);
3036 
3037 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3038    of KIND.  Return true if successful.  */
3039 
3040 static bool
3041 vect_build_slp_instance (vec_info *vinfo,
3042 			 slp_instance_kind kind,
3043 			 vec<stmt_vec_info> &scalar_stmts,
3044 			 vec<stmt_vec_info> &root_stmt_infos,
3045 			 unsigned max_tree_size, unsigned *limit,
3046 			 scalar_stmts_to_slp_tree_map_t *bst_map,
3047 			 /* ???  We need stmt_info for group splitting.  */
3048 			 stmt_vec_info stmt_info_)
3049 {
3050   if (dump_enabled_p ())
3051     {
3052       dump_printf_loc (MSG_NOTE, vect_location,
3053 		       "Starting SLP discovery for\n");
3054       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3055 	dump_printf_loc (MSG_NOTE, vect_location,
3056 			 "  %G", scalar_stmts[i]->stmt);
3057     }
3058 
3059   /* Build the tree for the SLP instance.  */
3060   unsigned int group_size = scalar_stmts.length ();
3061   bool *matches = XALLOCAVEC (bool, group_size);
3062   poly_uint64 max_nunits = 1;
3063   unsigned tree_size = 0;
3064   unsigned i;
3065   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3066 				       &max_nunits, matches, limit,
3067 				       &tree_size, bst_map);
3068   if (node != NULL)
3069     {
3070       /* Calculate the unrolling factor based on the smallest type.  */
3071       poly_uint64 unrolling_factor
3072 	= calculate_unrolling_factor (max_nunits, group_size);
3073 
3074       if (maybe_ne (unrolling_factor, 1U)
3075 	  && is_a <bb_vec_info> (vinfo))
3076 	{
3077 	  unsigned HOST_WIDE_INT const_max_nunits;
3078 	  if (!max_nunits.is_constant (&const_max_nunits)
3079 	      || const_max_nunits > group_size)
3080 	    {
3081 	      if (dump_enabled_p ())
3082 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3083 				 "Build SLP failed: store group "
3084 				 "size not a multiple of the vector size "
3085 				 "in basic block SLP\n");
3086 	      vect_free_slp_tree (node);
3087 	      return false;
3088 	    }
3089 	  /* Fatal mismatch.  */
3090 	  if (dump_enabled_p ())
3091 	    dump_printf_loc (MSG_NOTE, vect_location,
3092 			     "SLP discovery succeeded but node needs "
3093 			     "splitting\n");
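	  /* Pretend all lanes matched but flag the first lane beyond the
	     last whole-vector boundary; e.g. group_size 7 with
	     const_max_nunits 4 clears matches[4], so the splitting code
	     below breaks the group into 4 + 3.  */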
3094 	  memset (matches, true, group_size);
3095 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
3096 	  vect_free_slp_tree (node);
3097 	}
3098       else
3099 	{
3100 	  /* Create a new SLP instance.  */
3101 	  slp_instance new_instance = XNEW (class _slp_instance);
3102 	  SLP_INSTANCE_TREE (new_instance) = node;
3103 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3104 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
3105 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3106 	  SLP_INSTANCE_KIND (new_instance) = kind;
3107 	  new_instance->reduc_phis = NULL;
3108 	  new_instance->cost_vec = vNULL;
3109 	  new_instance->subgraph_entries = vNULL;
3110 
3111 	  if (dump_enabled_p ())
3112 	    dump_printf_loc (MSG_NOTE, vect_location,
3113 			     "SLP size %u vs. limit %u.\n",
3114 			     tree_size, max_tree_size);
3115 
3116 	  /* Fixup SLP reduction chains.  */
3117 	  if (kind == slp_inst_kind_reduc_chain)
3118 	    {
3119 	      /* If this is a reduction chain with a conversion in front
3120 		 amend the SLP tree with a node for that.  */
3121 	      gimple *scalar_def
3122 		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3123 	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3124 		{
3125 		  /* Get at the conversion stmt - we know it's the single use
3126 		     of the last stmt of the reduction chain.  */
3127 		  use_operand_p use_p;
3128 		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3129 					   &use_p, &scalar_def);
3130 		  gcc_assert (r);
3131 		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3132 		  next_info = vect_stmt_to_vectorize (next_info);
3133 		  scalar_stmts = vNULL;
3134 		  scalar_stmts.create (group_size);
3135 		  for (unsigned i = 0; i < group_size; ++i)
3136 		    scalar_stmts.quick_push (next_info);
3137 		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3138 		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3139 		  SLP_TREE_CHILDREN (conv).quick_push (node);
3140 		  SLP_INSTANCE_TREE (new_instance) = conv;
3141 		  /* We also have to fake this conversion stmt as SLP reduction
3142 		     group so we don't have to mess with too much code
3143 		     elsewhere.  */
3144 		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3145 		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3146 		}
3147 	      /* Fill the backedge child of the PHI SLP node.  The
3148 		 general matching code cannot find it because the
3149 		 scalar code does not reflect how we vectorize the
3150 		 reduction.  */
3151 	      use_operand_p use_p;
3152 	      imm_use_iterator imm_iter;
3153 	      class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3154 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3155 				     gimple_get_lhs (scalar_def))
3156 		/* There are exactly two non-debug uses, the reduction
3157 		   PHI and the loop-closed PHI node.  */
3158 		if (!is_gimple_debug (USE_STMT (use_p))
3159 		    && gimple_bb (USE_STMT (use_p)) == loop->header)
3160 		  {
3161 		    auto_vec<stmt_vec_info, 64> phis (group_size);
3162 		    stmt_vec_info phi_info
3163 		      = vinfo->lookup_stmt (USE_STMT (use_p));
3164 		    for (unsigned i = 0; i < group_size; ++i)
3165 		      phis.quick_push (phi_info);
3166 		    slp_tree *phi_node = bst_map->get (phis);
3167 		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3168 		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3169 		      = SLP_INSTANCE_TREE (new_instance);
3170 		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
3171 		  }
3172 	    }
3173 
3174 	  vinfo->slp_instances.safe_push (new_instance);
3175 
3176 	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3177 	     the number of scalar stmts in the root in a few places.
3178 	     Verify that assumption holds.  */
3179 	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3180 			.length () == group_size);
3181 
3182 	  if (dump_enabled_p ())
3183 	    {
3184 	      dump_printf_loc (MSG_NOTE, vect_location,
3185 			       "Final SLP tree for instance %p:\n", new_instance);
3186 	      vect_print_slp_graph (MSG_NOTE, vect_location,
3187 				    SLP_INSTANCE_TREE (new_instance));
3188 	    }
3189 
3190 	  return true;
3191 	}
3192     }
3193   else
3194     {
3195       /* Failed to SLP.  */
3196       /* Free the allocated memory.  */
3197       scalar_stmts.release ();
3198     }
3199 
3200   stmt_vec_info stmt_info = stmt_info_;
3201   /* Try to break the group up into pieces.  */
3202   if (kind == slp_inst_kind_store)
3203     {
3204       /* ???  We could delay all the actual splitting of store-groups
3205 	 until after SLP discovery of the original group completed.
3206 	 Then we can recurse to vect_build_slp_instance directly.  */
3207       for (i = 0; i < group_size; i++)
3208 	if (!matches[i])
3209 	  break;
3210 
3211       /* For basic block SLP, try to break the group up into multiples of
3212 	 a vector size.  */
3213       if (is_a <bb_vec_info> (vinfo)
3214 	  && (i > 1 && i < group_size))
3215 	{
3216 	  tree scalar_type
3217 	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3218 	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3219 						      1 << floor_log2 (i));
3220 	  unsigned HOST_WIDE_INT const_nunits;
3221 	  if (vectype
3222 	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3223 	    {
3224 	      /* Split into two groups at the first vector boundary.  */
3225 	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3226 	      unsigned group1_size = i & ~(const_nunits - 1);
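	      /* E.g. a mismatch at i == 6 with const_nunits == 4 gives
		 group1_size == 4, so the six matching stores are handled
		 as a group of 4 plus a group of 2 below.  */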
3227 
3228 	      if (dump_enabled_p ())
3229 		dump_printf_loc (MSG_NOTE, vect_location,
3230 				 "Splitting SLP group at stmt %u\n", i);
3231 	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3232 							       group1_size);
3233 	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3234 						    kind, max_tree_size,
3235 						    limit);
3236 	      /* Split the rest at the failure point and possibly
3237 		 re-analyze the remaining matching part if it has
3238 		 at least two lanes.  */
3239 	      if (group1_size < i
3240 		  && (i + 1 < group_size
3241 		      || i - group1_size > 1))
3242 		{
3243 		  stmt_vec_info rest2 = rest;
3244 		  rest = vect_split_slp_store_group (rest, i - group1_size);
3245 		  if (i - group1_size > 1)
3246 		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3247 						      kind, max_tree_size,
3248 						      limit);
3249 		}
3250 	      /* Re-analyze the non-matching tail if it has at least
3251 		 two lanes.  */
3252 	      if (i + 1 < group_size)
3253 		res |= vect_analyze_slp_instance (vinfo, bst_map,
3254 						  rest, kind, max_tree_size,
3255 						  limit);
3256 	      return res;
3257 	    }
3258 	}
3259 
3260       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3261       if (is_a <loop_vec_info> (vinfo)
3262 	  && (i > 1 && i < group_size)
3263 	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3264 	{
3265 	  unsigned group1_size = i;
3266 
3267 	  if (dump_enabled_p ())
3268 	    dump_printf_loc (MSG_NOTE, vect_location,
3269 			     "Splitting SLP group at stmt %u\n", i);
3270 
3271 	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3272 							   group1_size);
3273 	  /* Loop vectorization cannot handle gaps in stores, make sure
3274 	     the split group appears as strided.  */
3275 	  STMT_VINFO_STRIDED_P (rest) = 1;
3276 	  DR_GROUP_GAP (rest) = 0;
3277 	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
3278 	  DR_GROUP_GAP (stmt_info) = 0;
3279 
3280 	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3281 						kind, max_tree_size, limit);
3282 	  if (i + 1 < group_size)
3283 	    res |= vect_analyze_slp_instance (vinfo, bst_map,
3284 					      rest, kind, max_tree_size, limit);
3285 
3286 	  return res;
3287 	}
3288 
3289       /* Even though not all of the first vector matched, we might be able to SLP
3290 	 (some) of the remainder.  FORNOW ignore this possibility.  */
3291     }
3292 
3293   /* Failed to SLP.  */
3294   if (dump_enabled_p ())
3295     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3296   return false;
3297 }
3298 
3299 
3300 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3301    vect_build_slp_tree to build a tree of packed stmts if possible.
3302    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3303 
3304 static bool
3305 vect_analyze_slp_instance (vec_info *vinfo,
3306 			   scalar_stmts_to_slp_tree_map_t *bst_map,
3307 			   stmt_vec_info stmt_info,
3308 			   slp_instance_kind kind,
3309 			   unsigned max_tree_size, unsigned *limit)
3310 {
3311   unsigned int i;
3312   vec<stmt_vec_info> scalar_stmts;
3313 
3314   if (is_a <bb_vec_info> (vinfo))
3315     vect_location = stmt_info->stmt;
3316 
3317   stmt_vec_info next_info = stmt_info;
3318   if (kind == slp_inst_kind_store)
3319     {
3320       /* Collect the stores and store them in scalar_stmts.  */
3321       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3322       while (next_info)
3323 	{
3324 	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3325 	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3326 	}
3327     }
3328   else if (kind == slp_inst_kind_reduc_chain)
3329     {
3330       /* Collect the reduction stmts and store them in scalar_stmts.  */
3331       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3332       while (next_info)
3333 	{
3334 	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3335 	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3336 	}
3337       /* Mark the first element of the reduction chain as reduction to properly
3338 	 transform the node.  In the reduction analysis phase only the last
3339 	 element of the chain is marked as reduction.  */
3340       STMT_VINFO_DEF_TYPE (stmt_info)
3341 	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3342       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3343 	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3344     }
3345   else if (kind == slp_inst_kind_ctor)
3346     {
3347       tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3348       tree val;
3349       scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3350       FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3351 	{
3352 	  stmt_vec_info def_info = vinfo->lookup_def (val);
3353 	  def_info = vect_stmt_to_vectorize (def_info);
3354 	  scalar_stmts.quick_push (def_info);
3355 	}
3356       if (dump_enabled_p ())
3357 	dump_printf_loc (MSG_NOTE, vect_location,
3358 			 "Analyzing vectorizable constructor: %G\n",
3359 			 stmt_info->stmt);
3360     }
3361   else if (kind == slp_inst_kind_reduc_group)
3362     {
3363       /* Collect reduction statements.  */
3364       const vec<stmt_vec_info> &reductions
3365 	= as_a <loop_vec_info> (vinfo)->reductions;
3366       scalar_stmts.create (reductions.length ());
3367       for (i = 0; reductions.iterate (i, &next_info); i++)
3368 	if ((STMT_VINFO_RELEVANT_P (next_info)
3369 	     || STMT_VINFO_LIVE_P (next_info))
3370 	    /* ???  Make sure we didn't skip a conversion around a reduction
3371 	       path.  In that case we'd have to reverse engineer that conversion
3372 	       stmt following the chain using reduc_idx and from the PHI
3373 	       using reduc_def.  */
3374 	    && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3375 	  scalar_stmts.quick_push (next_info);
3376       /* If less than two were relevant/live there's nothing to SLP.  */
3377       if (scalar_stmts.length () < 2)
3378 	return false;
3379     }
3380   else
3381     gcc_unreachable ();
3382 
3383   vec<stmt_vec_info> roots = vNULL;
3384   if (kind == slp_inst_kind_ctor)
3385     {
3386       roots.create (1);
3387       roots.quick_push (stmt_info);
3388     }
3389   /* Build the tree for the SLP instance.  */
3390   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3391 				      roots,
3392 				      max_tree_size, limit, bst_map,
3393 				      kind == slp_inst_kind_store
3394 				      ? stmt_info : NULL);
3395   if (!res)
3396     roots.release ();
3397 
3398   /* ???  If this is slp_inst_kind_store and the above succeeded, here's
3399      where we should do store group splitting.  */
3400 
3401   return res;
3402 }
3403 
3404 /* Check if there are stmts in the loop that can be vectorized using SLP.  Build SLP
3405    trees of packed scalar stmts if SLP is possible.  */
3406 
3407 opt_result
3408 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3409 {
3410   unsigned int i;
3411   stmt_vec_info first_element;
3412   slp_instance instance;
3413 
3414   DUMP_VECT_SCOPE ("vect_analyze_slp");
3415 
3416   unsigned limit = max_tree_size;
3417 
3418   scalar_stmts_to_slp_tree_map_t *bst_map
3419     = new scalar_stmts_to_slp_tree_map_t ();
3420 
3421   /* Find SLP sequences starting from groups of grouped stores.  */
3422   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3423     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3424 			       STMT_VINFO_GROUPED_ACCESS (first_element)
3425 			       ? slp_inst_kind_store : slp_inst_kind_ctor,
3426 			       max_tree_size, &limit);
3427 
3428   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3429     {
3430       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3431 	{
3432 	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3433 	  /* Apply patterns.  */
3434 	  for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3435 	    bb_vinfo->roots[i].stmts[j]
3436 	      = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3437 	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3438 				       bb_vinfo->roots[i].stmts,
3439 				       bb_vinfo->roots[i].roots,
3440 				       max_tree_size, &limit, bst_map, NULL))
3441 	    {
3442 	      bb_vinfo->roots[i].stmts = vNULL;
3443 	      bb_vinfo->roots[i].roots = vNULL;
3444 	    }
3445 	}
3446     }
3447 
3448   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3449     {
3450       /* Find SLP sequences starting from reduction chains.  */
3451       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3452 	if (! STMT_VINFO_RELEVANT_P (first_element)
3453 	    && ! STMT_VINFO_LIVE_P (first_element))
3454 	  ;
3455 	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3456 					      slp_inst_kind_reduc_chain,
3457 					      max_tree_size, &limit))
3458 	  {
3459 	    /* Dissolve reduction chain group.  */
3460 	    stmt_vec_info vinfo = first_element;
3461 	    stmt_vec_info last = NULL;
3462 	    while (vinfo)
3463 	      {
3464 		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3465 		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3466 		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3467 		last = vinfo;
3468 		vinfo = next;
3469 	      }
3470 	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3471 	    /* It can be still vectorized as part of an SLP reduction.  */
3472 	    loop_vinfo->reductions.safe_push (last);
3473 	  }
3474 
3475       /* Find SLP sequences starting from groups of reductions.  */
3476       if (loop_vinfo->reductions.length () > 1)
3477 	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3478 				   slp_inst_kind_reduc_group, max_tree_size,
3479 				   &limit);
3480     }
3481 
3482   hash_set<slp_tree> visited_patterns;
3483   slp_tree_to_load_perm_map_t perm_cache;
3484   slp_compat_nodes_map_t compat_cache;
3485 
3486   /* See if any patterns can be found in the SLP tree.  */
3487   bool pattern_found = false;
3488   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3489     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3490 					      &visited_patterns, &perm_cache,
3491 					      &compat_cache);
3492 
3493   /* If any were found optimize permutations of loads.  */
3494   if (pattern_found)
3495     {
3496       hash_map<slp_tree, slp_tree> load_map;
3497       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3498 	{
3499 	  slp_tree root = SLP_INSTANCE_TREE (instance);
3500 	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3501 					&load_map, root);
3502 	}
3503     }
3504 
3505 
3506 
3507   /* The map keeps a reference on SLP nodes built, release that.  */
3508   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3509        it != bst_map->end (); ++it)
3510     if ((*it).second)
3511       vect_free_slp_tree ((*it).second);
3512   delete bst_map;
3513 
3514   if (pattern_found && dump_enabled_p ())
3515     {
3516       dump_printf_loc (MSG_NOTE, vect_location,
3517 		       "Pattern matched SLP tree\n");
3518       hash_set<slp_tree> visited;
3519       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3520 	vect_print_slp_graph (MSG_NOTE, vect_location,
3521 			      SLP_INSTANCE_TREE (instance), visited);
3522     }
3523 
3524   return opt_result::success ();
3525 }
3526 
3527 struct slpg_vertex
3528 {
3529   slpg_vertex (slp_tree node_)
3530     : node (node_), perm_in (-1), perm_out (-1) {}
3531 
3532   int get_perm_materialized () const
3533     { return perm_in != perm_out ? perm_in : 0; }
3534 
3535   slp_tree node;
3536   /* The common permutation on the incoming lanes (towards SLP children).  */
3537   int perm_in;
3538   /* The permutation on the outgoing lanes (towards SLP parents).  When
3539      the node is a materialization point for a permute this differs
3540      from perm_in (and is then usually zero).  Materialization happens
3541      on the input side.  */
3542   int perm_out;
3543 };
3544 
3545 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3546 
3547 static void
3548 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3549 			 vec<slpg_vertex> &vertices, vec<int> &leafs)
3550 {
3551   unsigned i;
3552   slp_tree child;
3553 
3554   if (visited.add (node))
3555     return;
3556 
3557   node->vertex = vertices.length ();
3558   vertices.safe_push (slpg_vertex (node));
3559 
3560   bool leaf = true;
3561   bool force_leaf = false;
3562   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3563     if (child)
3564       {
3565 	leaf = false;
3566 	vect_slp_build_vertices (visited, child, vertices, leafs);
3567       }
3568     else
3569       force_leaf = true;
3570   /* Since SLP discovery works along use-def edges all cycles have an
3571      entry - but there's the exception of cycles where we do not handle
3572      the entry explicitly (but with a NULL SLP node), like some reductions
3573      and inductions.  Force those SLP PHIs to act as leafs to make them
3574      backwards reachable.  */
3575   if (leaf || force_leaf)
3576     leafs.safe_push (node->vertex);
3577 }
3578 
3579 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3580 
3581 static void
3582 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3583 			 vec<int> &leafs)
3584 {
3585   hash_set<slp_tree> visited;
3586   unsigned i;
3587   slp_instance instance;
3588   FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3589     vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3590 			     leafs);
3591 }
3592 
3593 /* Apply (reverse) bijective PERM to VEC.  */
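/* E.g. for PERM = { 2, 0, 1 } and VEC = { a, b, c } the forward
   application yields { c, a, b } while the reverse application
   yields { b, c, a }.  */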
3594 
3595 template <class T>
3596 static void
3597 vect_slp_permute (vec<unsigned> perm,
3598 		  vec<T> &vec, bool reverse)
3599 {
3600   auto_vec<T, 64> saved;
3601   saved.create (vec.length ());
3602   for (unsigned i = 0; i < vec.length (); ++i)
3603     saved.quick_push (vec[i]);
3604 
3605   if (reverse)
3606     {
3607       for (unsigned i = 0; i < vec.length (); ++i)
3608 	vec[perm[i]] = saved[i];
3609       for (unsigned i = 0; i < vec.length (); ++i)
3610 	gcc_assert (vec[perm[i]] == saved[i]);
3611     }
3612   else
3613     {
3614       for (unsigned i = 0; i < vec.length (); ++i)
3615 	vec[i] = saved[perm[i]];
3616       for (unsigned i = 0; i < vec.length (); ++i)
3617 	gcc_assert (vec[i] == saved[perm[i]]);
3618     }
3619 }
3620 
3621 /* Return whether permutations PERM_A and PERM_B as recorded in the
3622    PERMS vector are equal.  */
3623 
3624 static bool
3625 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3626 		   int perm_a, int perm_b)
3627 {
3628   return (perm_a == perm_b
3629 	  || (perm_a != -1 && perm_b != -1
3630 	      && perms[perm_a].length () == perms[perm_b].length ()
3631 	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3632 			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3633 }
3634 
3635 /* Optimize the SLP graph of VINFO.  */
3636 
3637 void
3638 vect_optimize_slp (vec_info *vinfo)
3639 {
3640   if (vinfo->slp_instances.is_empty ())
3641     return;
3642 
3643   slp_tree node;
3644   unsigned i;
3645   auto_vec<slpg_vertex> vertices;
3646   auto_vec<int> leafs;
3647   vect_slp_build_vertices (vinfo, vertices, leafs);
3648 
3649   struct graph *slpg = new_graph (vertices.length ());
3650   for (slpg_vertex &v : vertices)
3651     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3652       if (child)
3653 	add_edge (slpg, v.node->vertex, child->vertex);
3654 
3655   /* Compute (reverse) postorder on the inverted graph.  */
3656   auto_vec<int> ipo;
3657   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3658 
3659   auto_vec<vec<unsigned> > perms;
3660   perms.safe_push (vNULL); /* zero is no permute */
3661 
3662   /* Produce initial permutations.  */
3663   for (i = 0; i < leafs.length (); ++i)
3664     {
3665       int idx = leafs[i];
3666       slp_tree node = vertices[idx].node;
3667 
3668       /* Handle externals and constants optimistically throughout the
3669 	 iteration.  But treat existing vectors as fixed since we
3670 	 do not handle permuting them below.  */
3671       if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3672 	   && !SLP_TREE_VEC_DEFS (node).exists ())
3673 	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3674 	continue;
3675 
3676       /* Leafs do not change across iterations.  Note leafs also double
3677 	 as entries to the reverse graph.  */
3678       if (!slpg->vertices[idx].succ)
3679 	{
3680 	  vertices[idx].perm_in = 0;
3681 	  vertices[idx].perm_out = 0;
3682 	}
3683 
3684       /* Loads are the only thing generating permutes.  */
3685       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3686 	continue;
3687 
3688       /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3689 	 node unpermuted, record this permute.  */
3690       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3691       if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3692 	continue;
3693       dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3694       unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3695       bool any_permute = false;
3696       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3697 	{
3698 	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3699 	  imin = MIN (imin, idx);
3700 	  imax = MAX (imax, idx);
3701 	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3702 	    any_permute = true;
3703 	}
3704       /* If there's no permute no need to split one out.  */
3705       if (!any_permute)
3706 	continue;
3707       /* If the span doesn't match we'd disrupt VF computation, avoid
3708 	 that for now.  */
3709       if (imax - imin + 1 != SLP_TREE_LANES (node))
3710 	continue;
3711 
3712       /* For now only handle true permutes, like
3713 	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
3714	 when permuting constants and invariants, keeping the permute
3715 	 bijective.  */
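      /* E.g. the load permutation { 5, 4, 7, 6 } spans four lanes from
	 imin 4 and becomes the bijective permute { 1, 0, 3, 2 } recorded
	 below; something like { 4, 4, 6, 7 } with a duplicated index is
	 rejected by the bitmap check.  */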
3716       auto_sbitmap load_index (SLP_TREE_LANES (node));
3717       bitmap_clear (load_index);
3718       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3719 	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3720       unsigned j;
3721       for (j = 0; j < SLP_TREE_LANES (node); ++j)
3722 	if (!bitmap_bit_p (load_index, j))
3723 	  break;
3724       if (j != SLP_TREE_LANES (node))
3725 	continue;
3726 
3727       vec<unsigned> perm = vNULL;
3728       perm.safe_grow (SLP_TREE_LANES (node), true);
3729       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3730 	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3731       perms.safe_push (perm);
3732       vertices[idx].perm_in = perms.length () - 1;
3733       vertices[idx].perm_out = perms.length () - 1;
3734     }
3735 
3736   /* We have to mark outgoing permutations facing non-associating-reduction
3737      graph entries that are not represented as to be materialized.  */
3738   for (slp_instance instance : vinfo->slp_instances)
3739     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3740       {
3741 	/* Just setting perm_out isn't enough for the propagation to
3742 	   pick this up.  */
3743 	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3744 	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3745       }
3746     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
3747       {
3748 	stmt_vec_info stmt_info
3749 	  = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
3750 	stmt_vec_info reduc_info = info_for_reduction (vinfo, stmt_info);
3751 	if (needs_fold_left_reduction_p (TREE_TYPE
3752 					   (gimple_get_lhs (stmt_info->stmt)),
3753 					 STMT_VINFO_REDUC_CODE (reduc_info)))
3754 	  {
3755 	    unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
3756 	    vertices[node_i].perm_in = 0;
3757 	    vertices[node_i].perm_out = 0;
3758 	  }
3759       }
3760 
3761   /* Propagate permutes along the graph and compute materialization points.  */
3762   bool changed;
3763   bool do_materialization = false;
3764   unsigned iteration = 0;
3765   do
3766     {
3767       changed = false;
3768       ++iteration;
3769 
3770       if (dump_enabled_p ())
3771 	dump_printf_loc (MSG_NOTE, vect_location,
3772 			 "SLP optimize iteration %d\n", iteration);
3773 
3774       for (i = vertices.length (); i > 0 ; --i)
3775 	{
3776 	  int idx = ipo[i-1];
3777 	  slp_tree node = vertices[idx].node;
3778 
3779 	  /* Handle externals and constants optimistically throughout the
3780 	     iteration.  */
3781 	  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3782 	      || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3783 	    continue;
3784 
3785 	  /* We still eventually have failed backedge SLP nodes in the
3786 	     graph, those are only cancelled when analyzing operations.
3787 	     Simply treat them as transparent ops, propagating permutes
3788 	     through them.  */
3789 	  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3790 	    {
3791 	      /* We do not handle stores with a permutation, so all
3792 		 incoming permutes must have been materialized.  */
3793 	      stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3794 	      if (STMT_VINFO_DATA_REF (rep)
3795 		  && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3796 		{
3797 		  /* ???  We're forcing materialization in place
3798 		     of the child here, we'd need special handling
3799 		     in materialization to leave perm_in -1 here.  */
3800 		  vertices[idx].perm_in = 0;
3801 		  vertices[idx].perm_out = 0;
3802 		}
3803 	      /* We cannot move a permute across an operation that is
3804		 not independent of the lanes.  Note this is an explicit
3805 		 negative list since that's much shorter than the respective
3806 		 positive one but it's critical to keep maintaining it.  */
3807 	      if (is_gimple_call (STMT_VINFO_STMT (rep)))
3808 		switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3809 		  {
3810 		  case CFN_COMPLEX_ADD_ROT90:
3811 		  case CFN_COMPLEX_ADD_ROT270:
3812 		  case CFN_COMPLEX_MUL:
3813 		  case CFN_COMPLEX_MUL_CONJ:
3814 		  case CFN_VEC_ADDSUB:
3815 		  case CFN_VEC_FMADDSUB:
3816 		  case CFN_VEC_FMSUBADD:
3817 		    vertices[idx].perm_in = 0;
3818 		    vertices[idx].perm_out = 0;
3819 		  default:;
3820 		  }
3821 	    }
3822 
3823 	  if (!slpg->vertices[idx].succ)
3824 	    /* Pick up pre-computed leaf values.  */
3825 	    ;
3826 	  else
3827 	    {
3828 	      bool any_succ_perm_out_m1 = false;
3829 	      int perm_in = vertices[idx].perm_in;
3830 	      for (graph_edge *succ = slpg->vertices[idx].succ;
3831 		   succ; succ = succ->succ_next)
3832 		{
3833 		  int succ_idx = succ->dest;
3834 		  int succ_perm = vertices[succ_idx].perm_out;
3835 		  /* Handle unvisited (and constant) nodes optimistically.  */
3836 		  /* ???  But for constants once we want to handle
3837 		     non-bijective permutes we have to verify the permute,
3838 		     when unifying lanes, will not unify different constants.
3839 		     For example see gcc.dg/vect/bb-slp-14.c for a case
3840 		     that would break.  */
3841 		  if (succ_perm == -1)
3842 		    {
3843 		      /* When we handled a non-leaf optimistically, note
3844 			 that so we can adjust its outgoing permute below.  */
3845 		      slp_tree succ_node = vertices[succ_idx].node;
3846 		      if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3847 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3848 			any_succ_perm_out_m1 = true;
3849 		      continue;
3850 		    }
3851 		  if (perm_in == -1)
3852 		    perm_in = succ_perm;
3853 		  else if (succ_perm == 0
3854 			   || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3855 		    {
3856 		      perm_in = 0;
3857 		      break;
3858 		    }
3859 		}
3860 
3861 	      /* Adjust any incoming permutes we treated optimistically.  */
3862 	      if (perm_in != -1 && any_succ_perm_out_m1)
3863 		{
3864 		  for (graph_edge *succ = slpg->vertices[idx].succ;
3865 		       succ; succ = succ->succ_next)
3866 		    {
3867 		      slp_tree succ_node = vertices[succ->dest].node;
3868 		      if (vertices[succ->dest].perm_out == -1
3869 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3870 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3871 			{
3872 			  vertices[succ->dest].perm_out = perm_in;
3873 			  /* And ensure this propagates.  */
3874 			  if (vertices[succ->dest].perm_in == -1)
3875 			    vertices[succ->dest].perm_in = perm_in;
3876 			}
3877 		    }
3878 		  changed = true;
3879 		}
3880 
3881 	      if (!vect_slp_perms_eq (perms, perm_in,
3882 				      vertices[idx].perm_in))
3883 		{
3884 		  /* Make sure we eventually converge.  */
3885 		  gcc_checking_assert (vertices[idx].perm_in == -1
3886 				       || perm_in == 0);
3887 		  vertices[idx].perm_in = perm_in;
3888 
3889 		  /* While we can handle VEC_PERM nodes as transparent
3890 		     pass-through they can be a cheap materialization
3891 		     point as well.  In addition they can act as source
3892 		     of a random permutation as well.
3893 		     The following ensures that former materialization
3894 		     points that now have zero incoming permutes no
3895 		     longer appear as such and that former "any" permutes
3896 		     get pass-through.  We keep VEC_PERM nodes optimistic
3897 		     as "any" outgoing permute though.  */
3898 		  if (vertices[idx].perm_out != 0
3899 		      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3900 		    vertices[idx].perm_out = perm_in;
3901 		  changed = true;
3902 		}
3903 	    }
3904 
3905 	  /* Elide pruning at materialization points in the first
3906 	     iteration phase.  */
3907 	  if (!do_materialization)
3908 	    continue;
3909 
3910 	  int perm = vertices[idx].perm_out;
3911 	  if (perm == 0 || perm == -1)
3912 	    continue;
3913 
3914 	  /* Decide on permute materialization.  Look whether there's
3915 	     a use (pred) edge that is permuted differently than us.
3916 	     In that case mark ourselves so the permutation is applied.  */
3917 	  bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3918 	  if (all_preds_permuted)
3919 	    for (graph_edge *pred = slpg->vertices[idx].pred;
3920 		 pred; pred = pred->pred_next)
3921 	      {
3922 		int pred_perm = vertices[pred->src].perm_in;
3923 		gcc_checking_assert (pred_perm != -1);
3924 		if (!vect_slp_perms_eq (perms, perm, pred_perm))
3925 		  {
3926 		    all_preds_permuted = false;
3927 		    break;
3928 		  }
3929 	      }
3930 	  if (!all_preds_permuted)
3931 	    {
3932 	      vertices[idx].perm_out = 0;
3933 	      changed = true;
3934 	    }
3935 	}
3936 
3937       /* If the initial propagation converged, switch on materialization
3938 	 and re-propagate.  */
3939       if (!changed && !do_materialization)
3940 	{
3941 	  do_materialization = true;
3942 	  changed = true;
3943 	}
3944     }
3945   while (changed);
3946   statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3947 
3948   /* Materialize.  */
3949   for (i = 0; i < vertices.length (); ++i)
3950     {
3951       int perm_in = vertices[i].perm_in;
3952       slp_tree node = vertices[i].node;
3953 
3954       /* First permute invariant/external original successors, we handle
3955 	 those optimistically during propagation and duplicate them if
3956 	 they are used with different permutations.  */
3957       unsigned j;
3958       slp_tree child;
3959       if (perm_in > 0)
3960 	FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3961 	  {
3962 	    if (!child
3963 		|| (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3964 		    && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3965 	      continue;
3966 
3967 	    /* If the vector is uniform there's nothing to do.  */
3968 	    if (vect_slp_tree_uniform_p (child))
3969 	      continue;
3970 
3971 	    /* We can end up sharing some externals via two_operator
3972 	       handling.  Be prepared to unshare those.  */
3973 	    if (child->refcnt != 1)
3974 	      {
3975 		gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3976 		SLP_TREE_CHILDREN (node)[j] = child
3977 		  = vect_create_new_slp_node
3978 		      (SLP_TREE_SCALAR_OPS (child).copy ());
3979 	      }
3980 	    vect_slp_permute (perms[perm_in],
3981 			      SLP_TREE_SCALAR_OPS (child), true);
3982 	  }
3983 
3984       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3985 	{
3986 	  /* Apply the common permutes to the input vectors.  */
3987 	  if (perm_in > 0)
3988 	    {
3989 	      /* If the node is already a permute node we can apply
3990 		 the permutation to the lane selection, effectively
3991 		 materializing it on the incoming vectors.  */
3992 	      if (dump_enabled_p ())
3993 		dump_printf_loc (MSG_NOTE, vect_location,
3994 				 "simplifying permute node %p\n",
3995 				 node);
3996 	      for (unsigned k = 0;
3997 		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3998 		SLP_TREE_LANE_PERMUTATION (node)[k].second
3999 		  = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
4000 	    }
4001 	  /* Apply the anticipated output permute to the permute and
4002 	     stmt vectors.  */
4003 	  int perm_out = vertices[i].perm_out;
4004 	  if (perm_out > 0)
4005 	    {
4006 	      vect_slp_permute (perms[perm_out],
4007 				SLP_TREE_SCALAR_STMTS (node), true);
4008 	      vect_slp_permute (perms[perm_out],
4009 				SLP_TREE_LANE_PERMUTATION (node), true);
4010 	    }
4011 	}
4012       else if (vertices[i].get_perm_materialized () != 0)
4013 	{
4014 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4015 	    /* For loads simply drop the permutation, the load permutation
4016 	       already performs the desired permutation.  */
4017 	    ;
4018 	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4019 	    gcc_unreachable ();
4020 	  else
4021 	    {
4022 	      if (dump_enabled_p ())
4023 		dump_printf_loc (MSG_NOTE, vect_location,
4024 				 "inserting permute node in place of %p\n",
4025 				 node);
4026 
4027 	      /* Make a copy of NODE and in-place change it to a
4028 		 VEC_PERM node to permute the lanes of the copy.  */
4029 	      slp_tree copy = new _slp_tree;
4030 	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
4031 	      SLP_TREE_CHILDREN (node) = vNULL;
4032 	      SLP_TREE_SCALAR_STMTS (copy)
4033 		= SLP_TREE_SCALAR_STMTS (node).copy ();
4034 	      vect_slp_permute (perms[perm_in],
4035 				SLP_TREE_SCALAR_STMTS (copy), true);
4036 	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4037 	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4038 	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4039 	      SLP_TREE_LANE_PERMUTATION (copy)
4040 		= SLP_TREE_LANE_PERMUTATION (node);
4041 	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4042 	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4043 	      copy->refcnt = 1;
4044 	      copy->max_nunits = node->max_nunits;
4045 	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4046 	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4047 	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4048 
4049 	      /* Now turn NODE into a VEC_PERM.  */
4050 	      SLP_TREE_CHILDREN (node).safe_push (copy);
4051 	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4052 	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4053 		SLP_TREE_LANE_PERMUTATION (node)
4054 		  .quick_push (std::make_pair (0, perms[perm_in][j]));
4055 	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4056 	    }
4057 	}
4058       else if (perm_in > 0) /* perm_in == perm_out */
4059 	{
4060 	  /* Apply the reverse permutation to our stmts.  */
4061 	  vect_slp_permute (perms[perm_in],
4062 			    SLP_TREE_SCALAR_STMTS (node), true);
4063 	  /* And to the lane/load permutation, which we can simply
4064 	     make regular by design.  */
4065 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4066 	    {
4067 	      gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4068 	      /* ???  When we handle non-bijective permutes the idea
4069 		 is that we can force the load-permutation to be
4070 		 { min, min + 1, min + 2, ... max }.  But then the
4071 		 scalar defs might no longer match the lane content
4072 		 which means wrong-code with live lane vectorization.
4073 		 So we possibly have to have NULL entries for those.  */
4074 	      vect_slp_permute (perms[perm_in],
4075 				SLP_TREE_LOAD_PERMUTATION (node), true);
4076 	    }
4077 	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4078 	    gcc_unreachable ();
4079 	}
4080     }
4081 
4082   /* Elide any permutations at BB reduction roots.  */
4083   if (is_a <bb_vec_info> (vinfo))
4084     {
4085       for (slp_instance instance : vinfo->slp_instances)
4086 	{
4087 	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4088 	    continue;
4089 	  slp_tree old = SLP_INSTANCE_TREE (instance);
4090 	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4091 	      && SLP_TREE_CHILDREN (old).length () == 1)
4092 	    {
4093 	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
4094 	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4095 		{
4096 		  /* Preserve the special VEC_PERM we use to shield existing
4097 		     vector defs from the rest.  But make it a no-op.  */
4098 		  auto_vec<stmt_vec_info, 64> saved;
4099 		  saved.create (SLP_TREE_SCALAR_STMTS (old).length ());
4100 		  for (unsigned i = 0;
4101 		       i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4102 		    saved.quick_push (SLP_TREE_SCALAR_STMTS (old)[i]);
4103 		  for (unsigned i = 0;
4104 		       i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4105 		    SLP_TREE_SCALAR_STMTS (old)[i]
4106 		      = saved[SLP_TREE_LANE_PERMUTATION (old)[i].second];
4107 		  unsigned i = 0;
4108 		  for (std::pair<unsigned, unsigned> &p
4109 		       : SLP_TREE_LANE_PERMUTATION (old))
4110 		    p.second = i++;
4111 		}
4112 	      else
4113 		{
4114 		  SLP_INSTANCE_TREE (instance) = child;
4115 		  SLP_TREE_REF_COUNT (child)++;
4116 		  vect_free_slp_tree (old);
4117 		}
4118 	    }
4119 	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4120 		   && SLP_TREE_REF_COUNT (old) == 1
4121 		   && vertices[old->vertex].get_perm_materialized () != 0)
4122 	    {
4123 	      /* ???  For loads the situation is more complex since
4124 		 we can't modify the permute in place in case the
4125 		 node is used multiple times.  In fact for loads this
4126 		 should be somehow handled in the propagation engine.  */
4127 	      /* Apply the reverse permutation to our stmts.  */
4128 	      int perm = vertices[old->vertex].get_perm_materialized ();
4129 	      vect_slp_permute (perms[perm],
4130 				SLP_TREE_SCALAR_STMTS (old), true);
4131 	      vect_slp_permute (perms[perm],
4132 				SLP_TREE_LOAD_PERMUTATION (old), true);
4133 	    }
4134 	}
4135     }
4136 
4137   /* Free the perms vector used for propagation.  */
4138   while (!perms.is_empty ())
4139     perms.pop ().release ();
4140   free_graph (slpg);
4141 
4142 
4143   /* Now elide load permutations that are not necessary.  */
4144   for (i = 0; i < leafs.length (); ++i)
4145     {
4146       node = vertices[leafs[i]].node;
4147       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4148 	continue;
4149 
4150       /* In basic block vectorization we allow any subchain of an interleaving
4151 	 chain.
4152 	 FORNOW: not in loop SLP because of realignment complications.  */
4153       if (is_a <bb_vec_info> (vinfo))
4154 	{
4155 	  bool subchain_p = true;
4156 	  stmt_vec_info next_load_info = NULL;
4157 	  stmt_vec_info load_info;
4158 	  unsigned j;
4159 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4160 	    {
4161 	      if (j != 0
4162 		  && (next_load_info != load_info
4163 		      || DR_GROUP_GAP (load_info) != 1))
4164 		{
4165 		  subchain_p = false;
4166 		  break;
4167 		}
4168 	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4169 	    }
4170 	  if (subchain_p)
4171 	    {
4172 	      SLP_TREE_LOAD_PERMUTATION (node).release ();
4173 	      continue;
4174 	    }
4175 	}
4176       else
4177 	{
4178 	  stmt_vec_info load_info;
4179 	  bool this_load_permuted = false;
4180 	  unsigned j;
4181 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4182 	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4183 	      {
4184 		this_load_permuted = true;
4185 		break;
4186 	      }
4187 	  stmt_vec_info first_stmt_info
4188 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4189 	  if (!this_load_permuted
4190 	      /* The load requires permutation when unrolling exposes
4191 		 a gap either because the group is larger than the SLP
4192 		 group-size or because there is a gap between the groups.  */
4193 	      && (known_eq (LOOP_VINFO_VECT_FACTOR
4194 			      (as_a <loop_vec_info> (vinfo)), 1U)
4195 		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4196 		      && DR_GROUP_GAP (first_stmt_info) == 0)))
4197 	    {
4198 	      SLP_TREE_LOAD_PERMUTATION (node).release ();
4199 	      continue;
4200 	    }
4201 	}
4202     }
4203 }
4204 
4205 /* Gather loads reachable from the individual SLP graph entries.  */
4206 
4207 void
4208 vect_gather_slp_loads (vec_info *vinfo)
4209 {
4210   unsigned i;
4211   slp_instance instance;
4212   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4213     {
4214       hash_set<slp_tree> visited;
4215       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4216 			     SLP_INSTANCE_TREE (instance), visited);
4217     }
4218 }
4219 
4220 
4221 /* For each possible SLP instance decide whether to SLP it and calculate overall
4222    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
4223    least one instance.  */
4224 
4225 bool
4226 vect_make_slp_decision (loop_vec_info loop_vinfo)
4227 {
4228   unsigned int i;
4229   poly_uint64 unrolling_factor = 1;
4230   const vec<slp_instance> &slp_instances
4231     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4232   slp_instance instance;
4233   int decided_to_slp = 0;
4234 
4235   DUMP_VECT_SCOPE ("vect_make_slp_decision");
4236 
4237   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4238     {
4239       /* FORNOW: SLP if you can.  */
4240       /* All unroll factors have the form:
4241 
4242 	   GET_MODE_SIZE (vinfo->vector_mode) * X
4243 
4244 	 for some rational X, so they must have a common multiple.  */
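      /* E.g. instances that need unrolling factors 2 and 4 force a common
	 unrolling factor of 4 for the loop.  */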
4245       unrolling_factor
4246 	= force_common_multiple (unrolling_factor,
4247 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4248 
4249       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
4250 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4251 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
4252       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4253       decided_to_slp++;
4254     }
4255 
4256   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4257 
4258   if (decided_to_slp && dump_enabled_p ())
4259     {
4260       dump_printf_loc (MSG_NOTE, vect_location,
4261 		       "Decided to SLP %d instances. Unrolling factor ",
4262 		       decided_to_slp);
4263       dump_dec (MSG_NOTE, unrolling_factor);
4264       dump_printf (MSG_NOTE, "\n");
4265     }
4266 
4267   return (decided_to_slp > 0);
4268 }
4269 
4270 /* Private data for vect_detect_hybrid_slp.  */
4271 struct vdhs_data
4272 {
4273   loop_vec_info loop_vinfo;
4274   vec<stmt_vec_info> *worklist;
4275 };
4276 
4277 /* Walker for walk_gimple_op.  */
4278 
4279 static tree
4280 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4281 {
4282   walk_stmt_info *wi = (walk_stmt_info *)data;
4283   vdhs_data *dat = (vdhs_data *)wi->info;
4284 
4285   if (wi->is_lhs)
4286     return NULL_TREE;
4287 
4288   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4289   if (!def_stmt_info)
4290     return NULL_TREE;
4291   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4292   if (PURE_SLP_STMT (def_stmt_info))
4293     {
4294       if (dump_enabled_p ())
4295 	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4296 			 def_stmt_info->stmt);
4297       STMT_SLP_TYPE (def_stmt_info) = hybrid;
4298       dat->worklist->safe_push (def_stmt_info);
4299     }
4300 
4301   return NULL_TREE;
4302 }
4303 
4304 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4305    if so, otherwise push it to WORKLIST.  */
4306 
4307 static void
4308 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4309 			       vec<stmt_vec_info> &worklist,
4310 			       stmt_vec_info stmt_info)
4311 {
4312   if (dump_enabled_p ())
4313     dump_printf_loc (MSG_NOTE, vect_location,
4314 		     "Processing hybrid candidate : %G", stmt_info->stmt);
4315   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4316   imm_use_iterator iter2;
4317   ssa_op_iter iter1;
4318   use_operand_p use_p;
4319   def_operand_p def_p;
4320   bool any_def = false;
4321   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4322     {
4323       any_def = true;
4324       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4325 	{
4326 	  if (is_gimple_debug (USE_STMT (use_p)))
4327 	    continue;
4328 	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4329	  /* An out-of-loop use means this is a loop_vect sink.  */
4330 	  if (!use_info)
4331 	    {
4332 	      if (dump_enabled_p ())
4333 		dump_printf_loc (MSG_NOTE, vect_location,
4334 				 "Found loop_vect sink: %G", stmt_info->stmt);
4335 	      worklist.safe_push (stmt_info);
4336 	      return;
4337 	    }
4338 	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4339 	    {
4340 	      if (dump_enabled_p ())
4341 		dump_printf_loc (MSG_NOTE, vect_location,
4342 				 "Found loop_vect use: %G", use_info->stmt);
4343 	      worklist.safe_push (stmt_info);
4344 	      return;
4345 	    }
4346 	}
4347     }
4348   /* No def means this is a loop_vect sink.  */
4349   if (!any_def)
4350     {
4351       if (dump_enabled_p ())
4352 	dump_printf_loc (MSG_NOTE, vect_location,
4353 			 "Found loop_vect sink: %G", stmt_info->stmt);
4354       worklist.safe_push (stmt_info);
4355       return;
4356     }
4357   if (dump_enabled_p ())
4358     dump_printf_loc (MSG_NOTE, vect_location,
4359 		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4360   STMT_SLP_TYPE (stmt_info) = pure_slp;
4361 }
4362 
4363 /* Find stmts that must be both vectorized and SLPed.  */
4364 
4365 void
4366 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4367 {
4368   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4369 
4370   /* All stmts participating in SLP are marked pure_slp, all other
4371      stmts are loop_vect.
4372      First collect all loop_vect stmts into a worklist.
4373      SLP patterns cause not all original scalar stmts to appear in
4374      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4375      Rectify this here and do a backward walk over the IL only considering
4376      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4377      mark them as pure_slp.  */
4378   auto_vec<stmt_vec_info> worklist;
4379   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4380     {
4381       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4382       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4383 	   gsi_next (&gsi))
4384 	{
4385 	  gphi *phi = gsi.phi ();
4386 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4387 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4388 	    maybe_push_to_hybrid_worklist (loop_vinfo,
4389 					   worklist, stmt_info);
4390 	}
4391       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4392 	   gsi_prev (&gsi))
4393 	{
4394 	  gimple *stmt = gsi_stmt (gsi);
4395 	  if (is_gimple_debug (stmt))
4396 	    continue;
4397 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4398 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4399 	    {
4400 	      for (gimple_stmt_iterator gsi2
4401 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4402 		   !gsi_end_p (gsi2); gsi_next (&gsi2))
4403 		{
4404 		  stmt_vec_info patt_info
4405 		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4406 		  if (!STMT_SLP_TYPE (patt_info)
4407 		      && STMT_VINFO_RELEVANT (patt_info))
4408 		    maybe_push_to_hybrid_worklist (loop_vinfo,
4409 						   worklist, patt_info);
4410 		}
4411 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4412 	    }
4413 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4414 	    maybe_push_to_hybrid_worklist (loop_vinfo,
4415 					   worklist, stmt_info);
4416 	}
4417     }
4418 
4419   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4420      mark any SLP vectorized stmt as hybrid.
4421      ???  We're visiting def stmts N times (once for each non-SLP and
4422      once for each hybrid-SLP use).  */
4423   walk_stmt_info wi;
4424   vdhs_data dat;
4425   dat.worklist = &worklist;
4426   dat.loop_vinfo = loop_vinfo;
4427   memset (&wi, 0, sizeof (wi));
4428   wi.info = (void *)&dat;
4429   while (!worklist.is_empty ())
4430     {
4431       stmt_vec_info stmt_info = worklist.pop ();
4432       /* Since SSA operands are not set up for pattern stmts we need
4433 	 to use walk_gimple_op.  */
4434       wi.is_lhs = 0;
4435       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4436       /* For gather/scatter make sure to walk the offset operand, that
4437 	 can be a scaling and conversion away.  */
4438       gather_scatter_info gs_info;
4439       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4440 	  && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
4441 	{
4442 	  int dummy;
4443 	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
4444 	}
4445     }
4446 }
4447 
4448 
4449 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
4450 
4451 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4452   : vec_info (vec_info::bb, shared),
4453     bbs (_bbs),
4454     roots (vNULL)
4455 {
4456   for (unsigned i = 0; i < bbs.length (); ++i)
4457     {
4458       if (i != 0)
4459 	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4460 	     gsi_next (&si))
4461 	  {
4462 	    gphi *phi = si.phi ();
4463 	    gimple_set_uid (phi, 0);
4464 	    add_stmt (phi);
4465 	  }
4466       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4467 	   !gsi_end_p (gsi); gsi_next (&gsi))
4468 	{
4469 	  gimple *stmt = gsi_stmt (gsi);
4470 	  gimple_set_uid (stmt, 0);
4471 	  if (is_gimple_debug (stmt))
4472 	    continue;
4473 	  add_stmt (stmt);
4474 	}
4475     }
4476 }
4477 
4478 
4479 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4480    stmts in the basic block.  */
4481 
4482 _bb_vec_info::~_bb_vec_info ()
4483 {
4484   /* Reset region marker.  */
4485   for (unsigned i = 0; i < bbs.length (); ++i)
4486     {
4487       if (i != 0)
4488 	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4489 	     gsi_next (&si))
4490 	  {
4491 	    gphi *phi = si.phi ();
4492 	    gimple_set_uid (phi, -1);
4493 	  }
4494       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4495 	   !gsi_end_p (gsi); gsi_next (&gsi))
4496 	{
4497 	  gimple *stmt = gsi_stmt (gsi);
4498 	  gimple_set_uid (stmt, -1);
4499 	}
4500     }
4501 
4502   for (unsigned i = 0; i < roots.length (); ++i)
4503     {
4504       roots[i].stmts.release ();
4505       roots[i].roots.release ();
4506     }
4507   roots.release ();
4508 }
4509 
4510 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
4511    given that child nodes have already been processed, and that
4512    their def types currently match their SLP node's def type.  */
4513 
4514 static bool
4515 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4516 				    slp_instance node_instance,
4517 				    stmt_vector_for_cost *cost_vec)
4518 {
4519   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4520 
4521   /* Calculate the number of vector statements to be created for the
4522      scalar stmts in this node.  For SLP reductions it is equal to the
4523      number of vector statements in the children (which has already been
4524      calculated by the recursive call).  Otherwise it is the number of
4525      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4526      VF divided by the number of elements in a vector.  */
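  /* For example (purely illustrative): with VF = 4, two lanes in the node
     and a V4SI vector type this yields 4 * 2 / 4 = 2 vector statements.  */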
4527   if (!STMT_VINFO_DATA_REF (stmt_info)
4528       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4529     {
4530       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4531 	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4532 	  {
4533 	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4534 	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4535 	    break;
4536 	  }
4537     }
4538   else
4539     {
4540       poly_uint64 vf;
4541       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4542 	vf = loop_vinfo->vectorization_factor;
4543       else
4544 	vf = 1;
4545       unsigned int group_size = SLP_TREE_LANES (node);
4546       tree vectype = SLP_TREE_VECTYPE (node);
4547       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4548 	= vect_get_num_vectors (vf * group_size, vectype);
4549     }
4550 
4551   /* Handle purely internal nodes.  */
4552   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4553     {
4554       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
4555 	return false;
4556 
4557       stmt_vec_info slp_stmt_info;
4558       unsigned int i;
4559       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
4560 	{
4561 	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
4562 	      && !vectorizable_live_operation (vinfo,
4563 					       slp_stmt_info, NULL, node,
4564 					       node_instance, i,
4565 					       false, cost_vec))
4566 	    return false;
4567 	}
4568       return true;
4569     }
4570 
4571   bool dummy;
4572   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4573 			    node, node_instance, cost_vec);
4574 }
4575 
4576 /* Try to build NODE from scalars, returning true on success.
4577    NODE_INSTANCE is the SLP instance that contains NODE.  */
4578 
4579 static bool
4580 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4581 			      slp_instance node_instance)
4582 {
4583   stmt_vec_info stmt_info;
4584   unsigned int i;
4585 
4586   if (!is_a <bb_vec_info> (vinfo)
4587       || node == SLP_INSTANCE_TREE (node_instance)
4588       || !SLP_TREE_SCALAR_STMTS (node).exists ()
4589       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4590     return false;
4591 
4592   if (dump_enabled_p ())
4593     dump_printf_loc (MSG_NOTE, vect_location,
4594 		     "Building vector operands of %p from scalars instead\n", node);
4595 
4596   /* Don't remove and free the child nodes here, since they could be
4597      referenced by other structures.  The analysis and scheduling phases
4598      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
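  /* For example (purely illustrative): a node of loads that cannot be
     vectorized is demoted here; the scalar lhs of each stmt becomes an
     SLP_TREE_SCALAR_OPS entry and the vectors are built from those scalars
     at code-generation time.  */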
4599   unsigned int group_size = SLP_TREE_LANES (node);
4600   SLP_TREE_DEF_TYPE (node) = vect_external_def;
4601   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4602   SLP_TREE_LOAD_PERMUTATION (node).release ();
4603   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4604     {
4605       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4606       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4607     }
4608   return true;
4609 }
4610 
4611 /* Return true if all elements of the slice are the same.  */
4612 bool
4613 vect_scalar_ops_slice::all_same_p () const
4614 {
4615   for (unsigned int i = 1; i < length; ++i)
4616     if (!operand_equal_p (op (0), op (i)))
4617       return false;
4618   return true;
4619 }
4620 
4621 hashval_t
4622 vect_scalar_ops_slice_hash::hash (const value_type &s)
4623 {
4624   hashval_t hash = 0;
4625   for (unsigned i = 0; i < s.length; ++i)
4626     hash = iterative_hash_expr (s.op (i), hash);
4627   return hash;
4628 }
4629 
4630 bool
4631 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4632 				   const compare_type &s2)
4633 {
4634   if (s1.length != s2.length)
4635     return false;
4636   for (unsigned i = 0; i < s1.length; ++i)
4637     if (!operand_equal_p (s1.op (i), s2.op (i)))
4638       return false;
4639   return true;
4640 }
4641 
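/* These hash/equal hooks let vect_prologue_cost_for_slp below put
   vector-sized slices of SLP_TREE_SCALAR_OPS into a hash_set so that
   identical constant or invariant vectors are costed only once.  */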
4642 /* Compute the prologue cost for invariant or constant operands represented
4643    by NODE.  */
4644 
4645 static void
4646 vect_prologue_cost_for_slp (slp_tree node,
4647 			    stmt_vector_for_cost *cost_vec)
4648 {
4649   /* There's a special case of an existing vector, which costs nothing.  */
4650   if (SLP_TREE_SCALAR_OPS (node).length () == 0
4651       && !SLP_TREE_VEC_DEFS (node).is_empty ())
4652     return;
4653   /* Without looking at the actual initializer a vector of
4654      constants can be implemented as load from the constant pool.
4655      When all elements are the same we can use a splat.  */
4656   tree vectype = SLP_TREE_VECTYPE (node);
4657   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4658   unsigned HOST_WIDE_INT const_nunits;
4659   unsigned nelt_limit;
4660   auto ops = &SLP_TREE_SCALAR_OPS (node);
4661   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4662   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4663       && ! multiple_p (const_nunits, group_size))
4664     {
4665       nelt_limit = const_nunits;
4666       hash_set<vect_scalar_ops_slice_hash> vector_ops;
4667       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4668 	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4669 	  starts.quick_push (i * const_nunits);
4670     }
4671   else
4672     {
4673       /* If either the vector has variable length or the vectors
4674 	 are composed of repeated whole groups we only need to
4675 	 cost construction once.  All vectors will be the same.  */
4676       nelt_limit = group_size;
4677       starts.quick_push (0);
4678     }
4679   /* ???  We're just tracking whether vectors in a single node are the same.
4680      Ideally we'd do something more global.  */
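  /* For example (purely illustrative): a constant node is costed as a
     vector_load from the constant pool, a slice whose elements are all the
     same as a single scalar_to_vec splat, and any other invariant slice as
     a vec_construct.  */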
4681   for (unsigned int start : starts)
4682     {
4683       vect_cost_for_stmt kind;
4684       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4685 	kind = vector_load;
4686       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4687 	kind = scalar_to_vec;
4688       else
4689 	kind = vec_construct;
4690       record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4691     }
4692 }
4693 
4694 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4695    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4696 
4697    Return true if the operations are supported.  */
4698 
4699 static bool
4700 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4701 				  slp_instance node_instance,
4702 				  hash_set<slp_tree> &visited_set,
4703 				  vec<slp_tree> &visited_vec,
4704 				  stmt_vector_for_cost *cost_vec)
4705 {
4706   int i, j;
4707   slp_tree child;
4708 
4709   /* Assume we can code-generate all invariants.  */
4710   if (!node
4711       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4712       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4713     return true;
4714 
4715   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4716     {
4717       if (dump_enabled_p ())
4718 	dump_printf_loc (MSG_NOTE, vect_location,
4719 			 "Failed cyclic SLP reference in %p\n", node);
4720       return false;
4721     }
4722   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4723 
4724   /* If we already analyzed the exact same set of scalar stmts we're done.
4725      We share the generated vector stmts for those.  */
4726   if (visited_set.add (node))
4727     return true;
4728   visited_vec.safe_push (node);
4729 
4730   bool res = true;
4731   unsigned visited_rec_start = visited_vec.length ();
4732   unsigned cost_vec_rec_start = cost_vec->length ();
4733   bool seen_non_constant_child = false;
4734   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4735     {
4736       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4737 					      visited_set, visited_vec,
4738 					      cost_vec);
4739       if (!res)
4740 	break;
4741       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4742 	seen_non_constant_child = true;
4743     }
4744   /* We're having difficulties scheduling nodes with just constant
4745      operands and no scalar stmts since we then cannot compute a stmt
4746      insertion place.  */
4747   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4748     {
4749       if (dump_enabled_p ())
4750 	dump_printf_loc (MSG_NOTE, vect_location,
4751 			 "Cannot vectorize all-constant op node %p\n", node);
4752       res = false;
4753     }
4754 
4755   if (res)
4756     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4757 					      cost_vec);
4758   /* If analysis failed we have to pop all recursive visited nodes
4759      plus ourselves.  */
4760   if (!res)
4761     {
4762       while (visited_vec.length () >= visited_rec_start)
4763 	visited_set.remove (visited_vec.pop ());
4764       cost_vec->truncate (cost_vec_rec_start);
4765     }
4766 
4767   /* When the node can be vectorized, cost the invariant nodes it
4768      references.  This is not done in DFS order to allow the referring
4769      node's vectorizable_* calls to nail down the invariant node's vector
4770      type and possibly unshare it if it needs a different vector type than
4771      other referrers.  */
4772   if (res)
4773     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4774       if (child
4775 	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4776 	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4777 	  /* Perform usual caching, note code-generation still
4778 	     code-gens these nodes multiple times but we expect
4779 	     to CSE them later.  */
4780 	  && !visited_set.add (child))
4781 	{
4782 	  visited_vec.safe_push (child);
4783 	  /* ???  After auditing more code paths make a "default"
4784 	     and push the vector type from NODE to all children
4785 	     if it is not already set.  */
4786 	  /* Compute the number of vectors to be generated.  */
4787 	  tree vector_type = SLP_TREE_VECTYPE (child);
4788 	  if (!vector_type)
4789 	    {
4790 	      /* For shifts with a scalar argument we don't need
4791 		 to cost or code-generate anything.
4792 		 ???  Represent this more explicitly.  */
4793 	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4794 			   == shift_vec_info_type)
4795 			  && j == 1);
4796 	      continue;
4797 	    }
4798 	  unsigned group_size = SLP_TREE_LANES (child);
4799 	  poly_uint64 vf = 1;
4800 	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4801 	    vf = loop_vinfo->vectorization_factor;
4802 	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4803 	    = vect_get_num_vectors (vf * group_size, vector_type);
4804 	  /* And cost them.  */
4805 	  vect_prologue_cost_for_slp (child, cost_vec);
4806 	}
4807 
4808   /* If this node or any of its children can't be vectorized, try pruning
4809      the tree here rather than felling the whole thing.  */
4810   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4811     {
4812       /* We'll need to revisit this for invariant costing and number
4813 	 of vectorized stmt setting.   */
4814       res = true;
4815     }
4816 
4817   return res;
4818 }
4819 
4820 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4821    region and that can be vectorized using vectorizable_live_operation
4822    with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
4823    the scalar code computing them to be retained.  */
4824 
4825 static void
4826 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4827 			     slp_instance instance,
4828 			     stmt_vector_for_cost *cost_vec,
4829 			     hash_set<stmt_vec_info> &svisited,
4830 			     hash_set<slp_tree> &visited)
4831 {
4832   if (visited.add (node))
4833     return;
4834 
4835   unsigned i;
4836   stmt_vec_info stmt_info;
4837   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4838   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4839     {
4840       if (svisited.contains (stmt_info))
4841 	continue;
4842       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4843       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4844 	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4845 	/* Only the pattern root stmt computes the original scalar value.  */
4846 	continue;
4847       bool mark_visited = true;
4848       gimple *orig_stmt = orig_stmt_info->stmt;
4849       ssa_op_iter op_iter;
4850       def_operand_p def_p;
4851       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4852 	{
4853 	  imm_use_iterator use_iter;
4854 	  gimple *use_stmt;
4855 	  stmt_vec_info use_stmt_info;
4856 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4857 	    if (!is_gimple_debug (use_stmt))
4858 	      {
4859 		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4860 		if (!use_stmt_info
4861 		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4862 		  {
4863 		    STMT_VINFO_LIVE_P (stmt_info) = true;
4864 		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
4865 						     NULL, node, instance, i,
4866 						     false, cost_vec))
4867 		      /* ???  So we know we can vectorize the live stmt
4868 			 from one SLP node.  If we cannot do so from all
4869 			 or none consistently we'd have to record which
4870 			 SLP node (and lane) we want to use for the live
4871 			 operation.  So make sure we can code-generate
4872 			 from all nodes.  */
4873 		      mark_visited = false;
4874 		    else
4875 		      STMT_VINFO_LIVE_P (stmt_info) = false;
4876 		    break;
4877 		  }
4878 	      }
4879 	  /* We have to verify whether we can insert the lane extract
4880 	     before all uses.  The following is a conservative approximation.
4881 	     We cannot put this into vectorizable_live_operation because
4882 	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4883 	     doesn't work.
4884 	     Note that although emitting code for loads at the first load
4885 	     should make this a non-problem, leaves we construct from scalars
4886 	     are vectorized after the last scalar def.
4887 	     ???  If we'd actually compute the insert location during
4888 	     analysis we could use sth less conservative than the last
4889 	     scalar stmt in the node for the dominance check.  */
4890 	  /* ???  What remains is "live" uses in vector CTORs in the same
4891 	     SLP graph which is where those uses can end up code-generated
4892 	     right after their definition instead of close to their original
4893 	     use.  But that would restrict us to code-generate lane-extracts
4894 	     from the latest stmt in a node.  So we compensate for this
4895 	     during code-generation, simply not replacing uses for those
4896 	     hopefully rare cases.  */
4897 	  if (STMT_VINFO_LIVE_P (stmt_info))
4898 	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4899 	      if (!is_gimple_debug (use_stmt)
4900 		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4901 		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4902 		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4903 		{
4904 		  if (dump_enabled_p ())
4905 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4906 				     "Cannot determine insertion place for "
4907 				     "lane extract\n");
4908 		  STMT_VINFO_LIVE_P (stmt_info) = false;
4909 		  mark_visited = true;
4910 		}
4911 	}
4912       if (mark_visited)
4913 	svisited.add (stmt_info);
4914     }
4915 
4916   slp_tree child;
4917   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4918     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4919       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4920 				   cost_vec, svisited, visited);
4921 }
4922 
4923 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
4924 
4925 static bool
4926 vectorizable_bb_reduc_epilogue (slp_instance instance,
4927 				stmt_vector_for_cost *cost_vec)
4928 {
4929   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4930   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4931   if (reduc_code == MINUS_EXPR)
4932     reduc_code = PLUS_EXPR;
4933   internal_fn reduc_fn;
4934   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4935   if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4936       || reduc_fn == IFN_LAST
4937       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4938       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4939 				     TREE_TYPE (vectype)))
4940     return false;
4941 
4942   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4943      cost log2 vector operations plus shuffles and one extraction.  */
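  /* For example (purely illustrative): a V4SF reduction is costed as
     floor_log2 (4) = 2 vector_stmt plus 2 vec_perm operations and one
     vec_to_scalar extraction.  */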
4944   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4945   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4946 		    vectype, 0, vect_body);
4947   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4948 		    vectype, 0, vect_body);
4949   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4950 		    vectype, 0, vect_body);
4951   return true;
4952 }
4953 
4954 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4955    and recurse to children.  */
4956 
4957 static void
4958 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4959 			      hash_set<slp_tree> &visited)
4960 {
4961   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4962       || visited.add (node))
4963     return;
4964 
4965   stmt_vec_info stmt;
4966   unsigned i;
4967   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4968     roots.remove (vect_orig_stmt (stmt));
4969 
4970   slp_tree child;
4971   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4972     if (child)
4973       vect_slp_prune_covered_roots (child, roots, visited);
4974 }
4975 
4976 /* Analyze statements in SLP instances of VINFO.  Return true if the
4977    operations are supported. */
4978 
4979 bool
4980 vect_slp_analyze_operations (vec_info *vinfo)
4981 {
4982   slp_instance instance;
4983   int i;
4984 
4985   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4986 
4987   hash_set<slp_tree> visited;
4988   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4989     {
4990       auto_vec<slp_tree> visited_vec;
4991       stmt_vector_for_cost cost_vec;
4992       cost_vec.create (2);
4993       if (is_a <bb_vec_info> (vinfo))
4994 	vect_location = instance->location ();
4995       if (!vect_slp_analyze_node_operations (vinfo,
4996 					     SLP_INSTANCE_TREE (instance),
4997 					     instance, visited, visited_vec,
4998 					     &cost_vec)
4999 	  /* CTOR instances require vectorized defs for the SLP tree root.  */
5000 	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
5001 	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
5002 		  != vect_internal_def
5003 		  /* Make sure we vectorized with the expected type.  */
5004 		  || !useless_type_conversion_p
5005 			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
5006 					      (instance->root_stmts[0]->stmt))),
5007 			 TREE_TYPE (SLP_TREE_VECTYPE
5008 					    (SLP_INSTANCE_TREE (instance))))))
5009 	  /* Check we can vectorize the reduction.  */
5010 	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
5011 	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
5012         {
5013 	  slp_tree node = SLP_INSTANCE_TREE (instance);
5014 	  stmt_vec_info stmt_info;
5015 	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5016 	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5017 	  else
5018 	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5019 	  if (dump_enabled_p ())
5020 	    dump_printf_loc (MSG_NOTE, vect_location,
5021 			     "removing SLP instance operations starting from: %G",
5022 			     stmt_info->stmt);
5023 	  vect_free_slp_instance (instance);
5024           vinfo->slp_instances.ordered_remove (i);
5025 	  cost_vec.release ();
5026 	  while (!visited_vec.is_empty ())
5027 	    visited.remove (visited_vec.pop ());
5028 	}
5029       else
5030 	{
5031 	  i++;
5032 	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
5033 	    {
5034 	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
5035 	      cost_vec.release ();
5036 	    }
5037 	  else
5038 	    /* For BB vectorization remember the SLP graph entry
5039 	       cost for later.  */
5040 	    instance->cost_vec = cost_vec;
5041 	}
5042     }
5043 
5044   /* Now look for SLP instances with a root that are covered by other
5045      instances and remove them.  */
5046   hash_set<stmt_vec_info> roots;
5047   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5048     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5049       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5050   if (!roots.is_empty ())
5051     {
5052       visited.empty ();
5053       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5054 	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5055 				      visited);
5056       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5057 	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5058 	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5059 	  {
5060 	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5061 	    if (dump_enabled_p ())
5062 	      dump_printf_loc (MSG_NOTE, vect_location,
5063 			       "removing SLP instance operations starting "
5064 			       "from: %G", root->stmt);
5065 	    vect_free_slp_instance (instance);
5066 	    vinfo->slp_instances.ordered_remove (i);
5067 	  }
5068 	else
5069 	  ++i;
5070     }
5071 
5072   /* Compute vectorizable live stmts.  */
5073   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5074     {
5075       hash_set<stmt_vec_info> svisited;
5076       hash_set<slp_tree> visited;
5077       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5078 	{
5079 	  vect_location = instance->location ();
5080 	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5081 				       instance, &instance->cost_vec, svisited,
5082 				       visited);
5083 	}
5084     }
5085 
5086   return !vinfo->slp_instances.is_empty ();
5087 }
5088 
5089 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
5090    compressing any leader chain encountered along the way.  */
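/* For example (purely illustrative): with INSTANCE_LEADER containing
   A -> B, B -> C and C -> C, calling this on A returns C and rewrites the
   entries for A and B to point directly at C (path compression).  */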
5091 
5092 static slp_instance
5093 get_ultimate_leader (slp_instance instance,
5094 		     hash_map<slp_instance, slp_instance> &instance_leader)
5095 {
5096   auto_vec<slp_instance *, 8> chain;
5097   slp_instance *tem;
5098   while (*(tem = instance_leader.get (instance)) != instance)
5099     {
5100       chain.safe_push (tem);
5101       instance = *tem;
5102     }
5103   while (!chain.is_empty ())
5104     *chain.pop () = instance;
5105   return instance;
5106 }
5107 
5108 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
5109 
5110 static void
5111 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5112 			   slp_instance instance, slp_tree node,
5113 			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5114 			   hash_map<slp_instance, slp_instance> &instance_leader,
5115 			   hash_set<slp_tree> &visited)
5116 {
5117   stmt_vec_info stmt_info;
5118   unsigned i;
5119 
5120   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5121     {
5122       bool existed_p;
5123       slp_instance &stmt_instance
5124 	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5125       if (!existed_p)
5126 	;
5127       else if (stmt_instance != instance)
5128 	{
5129 	  /* If we're running into a previously marked stmt make us the
5130 	     leader of the current ultimate leader.  This keeps the
5131 	     leader chain acyclic and works even when the current instance
5132 	     connects two previously independent graph parts.  */
5133 	  slp_instance stmt_leader
5134 	    = get_ultimate_leader (stmt_instance, instance_leader);
5135 	  if (stmt_leader != instance)
5136 	    instance_leader.put (stmt_leader, instance);
5137 	}
5138       stmt_instance = instance;
5139     }
5140 
5141   if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5142     return;
5143 
5144   slp_tree child;
5145   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5146     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5147       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5148 				 instance_leader, visited);
5149 }
5150 
5151 /* Partition the SLP graph into pieces that can be costed independently.  */
5152 
5153 static void
5154 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5155 {
5156   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5157 
5158   /* First walk the SLP graph assigning each involved scalar stmt a
5159      corresponding SLP graph entry and upon visiting a previously
5160      marked stmt, make the stmt's leader the current SLP graph entry.  */
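  /* For example (purely illustrative): if two SLP instances share a scalar
     stmt, visiting that stmt a second time merges their leaders so both
     instances end up in the same independently costed subgraph.  */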
5161   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5162   hash_map<slp_instance, slp_instance> instance_leader;
5163   hash_set<slp_tree> visited;
5164   slp_instance instance;
5165   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5166     {
5167       instance_leader.put (instance, instance);
5168       vect_bb_partition_graph_r (bb_vinfo,
5169 				 instance, SLP_INSTANCE_TREE (instance),
5170 				 stmt_to_instance, instance_leader,
5171 				 visited);
5172     }
5173 
5174   /* Then collect entries to each independent subgraph.  */
5175   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5176     {
5177       slp_instance leader = get_ultimate_leader (instance, instance_leader);
5178       leader->subgraph_entries.safe_push (instance);
5179       if (dump_enabled_p ()
5180 	  && leader != instance)
5181 	dump_printf_loc (MSG_NOTE, vect_location,
5182 			 "instance %p is leader of %p\n",
5183 			 leader, instance);
5184     }
5185 }
5186 
5187 /* Compute the set of scalar stmts participating in internal and external
5188    nodes.  */
5189 
5190 static void
5191 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5192 					 hash_set<slp_tree> &visited,
5193 					 hash_set<stmt_vec_info> &vstmts,
5194 					 hash_set<stmt_vec_info> &estmts)
5195 {
5196   int i;
5197   stmt_vec_info stmt_info;
5198   slp_tree child;
5199 
5200   if (visited.add (node))
5201     return;
5202 
5203   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5204     {
5205       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5206 	vstmts.add (stmt_info);
5207 
5208       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5209 	if (child)
5210 	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5211 						   vstmts, estmts);
5212     }
5213   else
5214     for (tree def : SLP_TREE_SCALAR_OPS (node))
5215       {
5216 	stmt_vec_info def_stmt = vinfo->lookup_def (def);
5217 	if (def_stmt)
5218 	  estmts.add (def_stmt);
5219       }
5220 }
5221 
5222 
5223 /* Compute the scalar cost of the SLP node NODE and its children
5224    and return it.  Do not account defs that are marked in LIFE and
5225    update LIFE according to uses of NODE.  */
5226 
5227 static void
5228 vect_bb_slp_scalar_cost (vec_info *vinfo,
5229 			 slp_tree node, vec<bool, va_heap> *life,
5230 			 stmt_vector_for_cost *cost_vec,
5231 			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5232 			 hash_set<slp_tree> &visited)
5233 {
5234   unsigned i;
5235   stmt_vec_info stmt_info;
5236   slp_tree child;
5237 
5238   if (visited.add (node))
5239     return;
5240 
5241   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5242     {
5243       ssa_op_iter op_iter;
5244       def_operand_p def_p;
5245 
5246       if ((*life)[i])
5247 	continue;
5248 
5249       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5250       gimple *orig_stmt = orig_stmt_info->stmt;
5251 
5252       /* If there is a non-vectorized use of the defs then the scalar
5253          stmt is kept live in which case we do not account it or any
5254 	 required defs in the SLP children in the scalar cost.  This
5255 	 way we make the vectorization more costly when compared to
5256 	 the scalar cost.  */
5257       if (!STMT_VINFO_LIVE_P (stmt_info))
5258 	{
5259 	  auto_vec<gimple *, 8> worklist;
5260 	  hash_set<gimple *> *worklist_visited = NULL;
5261 	  worklist.quick_push (orig_stmt);
5262 	  do
5263 	    {
5264 	      gimple *work_stmt = worklist.pop ();
5265 	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5266 		{
5267 		  imm_use_iterator use_iter;
5268 		  gimple *use_stmt;
5269 		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5270 					 DEF_FROM_PTR (def_p))
5271 		    if (!is_gimple_debug (use_stmt))
5272 		      {
5273 			stmt_vec_info use_stmt_info
5274 			  = vinfo->lookup_stmt (use_stmt);
5275 			if (!use_stmt_info
5276 			    || !vectorized_scalar_stmts.contains (use_stmt_info))
5277 			  {
5278 			    if (use_stmt_info
5279 				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5280 			      {
5281 				/* For stmts participating in patterns we have
5282 				   to check its uses recursively.  */
5283 				if (!worklist_visited)
5284 				  worklist_visited = new hash_set<gimple *> ();
5285 				if (!worklist_visited->add (use_stmt))
5286 				  worklist.safe_push (use_stmt);
5287 				continue;
5288 			      }
5289 			    (*life)[i] = true;
5290 			    goto next_lane;
5291 			  }
5292 		      }
5293 		}
5294 	    }
5295 	  while (!worklist.is_empty ());
5296 next_lane:
5297 	  if (worklist_visited)
5298 	    delete worklist_visited;
5299 	  if ((*life)[i])
5300 	    continue;
5301 	}
5302 
5303       /* Count scalar stmts only once.  */
5304       if (gimple_visited_p (orig_stmt))
5305 	continue;
5306       gimple_set_visited (orig_stmt, true);
5307 
5308       vect_cost_for_stmt kind;
5309       if (STMT_VINFO_DATA_REF (orig_stmt_info))
5310 	{
5311 	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5312 	    kind = scalar_load;
5313 	  else
5314 	    kind = scalar_store;
5315 	}
5316       else if (vect_nop_conversion_p (orig_stmt_info))
5317 	continue;
5318       /* For single-argument PHIs assume coalescing which means zero cost
5319 	 for the scalar and the vector PHIs.  This avoids artificially
5320 	 favoring the vector path (but may pessimize it in some cases).  */
5321       else if (is_a <gphi *> (orig_stmt_info->stmt)
5322 	       && gimple_phi_num_args
5323 		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5324 	continue;
5325       else
5326 	kind = scalar_stmt;
5327       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5328 			SLP_TREE_VECTYPE (node), 0, vect_body);
5329     }
5330 
5331   auto_vec<bool, 20> subtree_life;
5332   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5333     {
5334       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5335 	{
5336 	  /* Do not directly pass LIFE to the recursive call, copy it to
5337 	     confine changes in the callee to the current child/subtree.  */
5338 	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5339 	    {
5340 	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5341 	      for (unsigned j = 0;
5342 		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5343 		{
5344 		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5345 		  if (perm.first == i)
5346 		    subtree_life[perm.second] = (*life)[j];
5347 		}
5348 	    }
5349 	  else
5350 	    {
5351 	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5352 	      subtree_life.safe_splice (*life);
5353 	    }
5354 	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5355 				   vectorized_scalar_stmts, visited);
5356 	  subtree_life.truncate (0);
5357 	}
5358     }
5359 }
5360 
5361 /* Comparator for the loop-index sorted cost vectors.  */
5362 
5363 static int
5364 li_cost_vec_cmp (const void *a_, const void *b_)
5365 {
5366   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5367   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5368   if (a->first < b->first)
5369     return -1;
5370   else if (a->first == b->first)
5371     return 0;
5372   return 1;
5373 }
5374 
5375 /* Check if vectorization of the basic block is profitable for the
5376    subgraph denoted by SLP_INSTANCES.  */
5377 
5378 static bool
5379 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5380 				    vec<slp_instance> slp_instances,
5381 				    loop_p orig_loop)
5382 {
5383   slp_instance instance;
5384   int i;
5385   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5386   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5387 
5388   if (dump_enabled_p ())
5389     {
5390       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5391       hash_set<slp_tree> visited;
5392       FOR_EACH_VEC_ELT (slp_instances, i, instance)
5393 	vect_print_slp_graph (MSG_NOTE, vect_location,
5394 			      SLP_INSTANCE_TREE (instance), visited);
5395     }
5396 
5397   /* Compute the set of scalar stmts we know will go away 'locally' when
5398      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
5399      not accurate for nodes promoted extern late or for scalar stmts that
5400      are used both in extern defs and in vectorized defs.  */
5401   hash_set<stmt_vec_info> vectorized_scalar_stmts;
5402   hash_set<stmt_vec_info> scalar_stmts_in_externs;
5403   hash_set<slp_tree> visited;
5404   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5405     {
5406       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5407 					       SLP_INSTANCE_TREE (instance),
5408 					       visited,
5409 					       vectorized_scalar_stmts,
5410 					       scalar_stmts_in_externs);
5411       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5412 	vectorized_scalar_stmts.add (rstmt);
5413     }
5414   /* Scalar stmts used as defs in external nodes need to be preserved, so
5415      remove them from vectorized_scalar_stmts.  */
5416   for (stmt_vec_info stmt : scalar_stmts_in_externs)
5417     vectorized_scalar_stmts.remove (stmt);
5418 
5419   /* Calculate scalar cost and sum the cost for the vector stmts
5420      previously collected.  */
5421   stmt_vector_for_cost scalar_costs = vNULL;
5422   stmt_vector_for_cost vector_costs = vNULL;
5423   visited.empty ();
5424   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5425     {
5426       auto_vec<bool, 20> life;
5427       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5428 			      true);
5429       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5430 	record_stmt_cost (&scalar_costs,
5431 			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
5432 			  scalar_stmt,
5433 			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5434       vect_bb_slp_scalar_cost (bb_vinfo,
5435 			       SLP_INSTANCE_TREE (instance),
5436 			       &life, &scalar_costs, vectorized_scalar_stmts,
5437 			       visited);
5438       vector_costs.safe_splice (instance->cost_vec);
5439       instance->cost_vec.release ();
5440     }
5441 
5442   if (dump_enabled_p ())
5443     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5444 
5445   /* When costing non-loop vectorization we need to consider each covered
5446      loop independently and make sure vectorization is profitable.  For
5447      now we assume a loop may not be entered or may be executed an
5448      arbitrary number of iterations (???  static information can provide
5449      more precise info here) which means we can simply cost each
5450      containing loop's stmts separately.  */
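  /* For example (purely illustrative): if the vectorized region spans the
     body of an inner loop and code after it, the scalar and vector costs of
     the inner-loop stmts are compared on their own, since that loop may
     iterate an unknown number of times.  */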
5451 
5452   /* First produce cost vectors sorted by loop index.  */
5453   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5454     li_scalar_costs (scalar_costs.length ());
5455   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5456     li_vector_costs (vector_costs.length ());
5457   stmt_info_for_cost *cost;
5458   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5459     {
5460       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5461       li_scalar_costs.quick_push (std::make_pair (l, cost));
5462     }
5463   /* Use an arbitrary loop from the scalar costs as fallback in case the
5464      first vector_costs entry does not have a stmt_info associated with it.  */
5465   unsigned l = li_scalar_costs[0].first;
5466   FOR_EACH_VEC_ELT (vector_costs, i, cost)
5467     {
5468       /* Invariants, externals and extracts immediately follow the cost for
5469 	 the related stmt, so for those we inherit the loop from the previous COST.  */
5470       if (cost->stmt_info)
5471 	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5472       li_vector_costs.quick_push (std::make_pair (l, cost));
5473     }
5474   li_scalar_costs.qsort (li_cost_vec_cmp);
5475   li_vector_costs.qsort (li_cost_vec_cmp);
5476 
5477   /* Now cost the portions individually.  */
5478   unsigned vi = 0;
5479   unsigned si = 0;
5480   bool profitable = true;
5481   while (si < li_scalar_costs.length ()
5482 	 && vi < li_vector_costs.length ())
5483     {
5484       unsigned sl = li_scalar_costs[si].first;
5485       unsigned vl = li_vector_costs[vi].first;
5486       if (sl != vl)
5487 	{
5488 	  if (dump_enabled_p ())
5489 	    dump_printf_loc (MSG_NOTE, vect_location,
5490 			     "Scalar %d and vector %d loop part do not "
5491 			     "match up, skipping scalar part\n", sl, vl);
5492 	  /* Skip the scalar part, assuming zero cost on the vector side.  */
5493 	  do
5494 	    {
5495 	      si++;
5496 	    }
5497 	  while (si < li_scalar_costs.length ()
5498 		 && li_scalar_costs[si].first == sl);
5499 	  continue;
5500 	}
5501 
5502       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5503       do
5504 	{
5505 	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5506 	  si++;
5507 	}
5508       while (si < li_scalar_costs.length ()
5509 	     && li_scalar_costs[si].first == sl);
5510       unsigned dummy;
5511       finish_cost (scalar_target_cost_data, nullptr,
5512 		   &dummy, &scalar_cost, &dummy);
5513 
5514       /* Complete the target-specific vector cost calculation.  */
5515       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5516       do
5517 	{
5518 	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5519 	  vi++;
5520 	}
5521       while (vi < li_vector_costs.length ()
5522 	     && li_vector_costs[vi].first == vl);
5523       finish_cost (vect_target_cost_data, scalar_target_cost_data,
5524 		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5525       delete scalar_target_cost_data;
5526       delete vect_target_cost_data;
5527 
5528       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5529 
5530       if (dump_enabled_p ())
5531 	{
5532 	  dump_printf_loc (MSG_NOTE, vect_location,
5533 			   "Cost model analysis for part in loop %d:\n", sl);
5534 	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
5535 		       vec_inside_cost + vec_outside_cost);
5536 	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
5537 	}
5538 
5539       /* Vectorization is profitable if its cost is no more than the cost
5540 	 of the scalar version.  Note that we err on the vector side for
5541 	 equal cost because the cost estimate is otherwise quite pessimistic
5542 	 (constant uses are free on the scalar side but cost a load on the
5543 	 vector side, for example).  */
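      /* For example (purely illustrative): a vector cost of 10 against a
	 scalar cost of 10 is still vectorized, while 11 against 10 is not.  */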
5544       if (vec_outside_cost + vec_inside_cost > scalar_cost)
5545 	{
5546 	  profitable = false;
5547 	  break;
5548 	}
5549     }
5550   if (profitable && vi < li_vector_costs.length ())
5551     {
5552       if (dump_enabled_p ())
5553 	dump_printf_loc (MSG_NOTE, vect_location,
5554 			 "Excess vector cost for part in loop %d:\n",
5555 			 li_vector_costs[vi].first);
5556       profitable = false;
5557     }
5558 
5559   /* Unset visited flag.  This is delayed when the subgraph is profitable
5560      and we process the loop for remaining unvectorized if-converted code.  */
5561   if (!orig_loop || !profitable)
5562     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5563       gimple_set_visited  (cost->stmt_info->stmt, false);
5564 
5565   scalar_costs.release ();
5566   vector_costs.release ();
5567 
5568   return profitable;
5569 }
5570 
5571 /* qsort comparator for lane defs.  */
5572 
5573 static int
5574 vld_cmp (const void *a_, const void *b_)
5575 {
5576   auto *a = (const std::pair<unsigned, tree> *)a_;
5577   auto *b = (const std::pair<unsigned, tree> *)b_;
5578   return a->first - b->first;
5579 }
5580 
5581 /* Return true if USE_STMT is a vector lane insert into VEC and set
5582    *THIS_LANE to the lane number that is set.  */
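/* For example (purely illustrative): with 32-bit vector elements,
   vec_2 = BIT_INSERT_EXPR <vec_1, s_5, 64> inserts s_5 into lane 64 / 32 = 2.  */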
5583 
5584 static bool
5585 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5586 {
5587   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5588   if (!use_ass
5589       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5590       || (vec
5591 	  ? gimple_assign_rhs1 (use_ass) != vec
5592 	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
5593       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5594 				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5595       || !constant_multiple_p
5596 	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5597 	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5598 	     this_lane))
5599     return false;
5600   return true;
5601 }
5602 
5603 /* Find any vectorizable constructors and add them to the grouped_store
5604    array.  */
5605 
5606 static void
5607 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5608 {
5609   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5610     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5611 	 !gsi_end_p (gsi); gsi_next (&gsi))
5612     {
5613       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5614       if (!assign)
5615 	continue;
5616 
5617       tree rhs = gimple_assign_rhs1 (assign);
5618       enum tree_code code = gimple_assign_rhs_code (assign);
5619       use_operand_p use_p;
5620       gimple *use_stmt;
5621       if (code == CONSTRUCTOR)
5622 	{
5623 	  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5624 	      || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5625 			   CONSTRUCTOR_NELTS (rhs))
5626 	      || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5627 	      || uniform_vector_p (rhs))
5628 	    continue;
5629 
5630 	  unsigned j;
5631 	  tree val;
5632 	  FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5633 	      if (TREE_CODE (val) != SSA_NAME
5634 		  || !bb_vinfo->lookup_def (val))
5635 		break;
5636 	  if (j != CONSTRUCTOR_NELTS (rhs))
5637 	    continue;
5638 
5639 	  stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5640 	  BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5641 	}
5642       else if (code == BIT_INSERT_EXPR
5643 	       && VECTOR_TYPE_P (TREE_TYPE (rhs))
5644 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5645 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5646 	       && integer_zerop (gimple_assign_rhs3 (assign))
5647 	       && useless_type_conversion_p
5648 		    (TREE_TYPE (TREE_TYPE (rhs)),
5649 		     TREE_TYPE (gimple_assign_rhs2 (assign)))
5650 	       && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5651 	{
5652 	  /* We start matching at an insert to lane zero, but since the
5653 	     inserts need not be ordered we have to search both
5654 	     the def and the use chains.  */
5655 	  tree vectype = TREE_TYPE (rhs);
5656 	  unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5657 	  auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5658 	  auto_sbitmap lanes (nlanes);
5659 	  bitmap_clear (lanes);
5660 	  bitmap_set_bit (lanes, 0);
5661 	  tree def = gimple_assign_lhs (assign);
5662 	  lane_defs.quick_push
5663 		      (std::make_pair (0, gimple_assign_rhs2 (assign)));
5664 	  unsigned lanes_found = 1;
5665 	  /* Start with the use chains, the last stmt will be the root.  */
5666 	  stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5667 	  vec<stmt_vec_info> roots = vNULL;
5668 	  roots.safe_push (last);
5669 	  do
5670 	    {
5671 	      use_operand_p use_p;
5672 	      gimple *use_stmt;
5673 	      if (!single_imm_use (def, &use_p, &use_stmt))
5674 		break;
5675 	      unsigned this_lane;
5676 	      if (!bb_vinfo->lookup_stmt (use_stmt)
5677 		  || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5678 		  || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5679 		break;
5680 	      if (bitmap_bit_p (lanes, this_lane))
5681 		break;
5682 	      lanes_found++;
5683 	      bitmap_set_bit (lanes, this_lane);
5684 	      gassign *use_ass = as_a <gassign *> (use_stmt);
5685 	      lane_defs.quick_push (std::make_pair
5686 				     (this_lane, gimple_assign_rhs2 (use_ass)));
5687 	      last = bb_vinfo->lookup_stmt (use_ass);
5688 	      roots.safe_push (last);
5689 	      def = gimple_assign_lhs (use_ass);
5690 	    }
5691 	  while (lanes_found < nlanes);
5692 	  if (roots.length () > 1)
5693 	    std::swap(roots[0], roots[roots.length () - 1]);
5694 	  if (lanes_found < nlanes)
5695 	    {
5696 	      /* Now search the def chain.  */
5697 	      def = gimple_assign_rhs1 (assign);
5698 	      do
5699 		{
5700 		  if (TREE_CODE (def) != SSA_NAME
5701 		      || !has_single_use (def))
5702 		    break;
5703 		  gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5704 		  unsigned this_lane;
5705 		  if (!bb_vinfo->lookup_stmt (def_stmt)
5706 		      || !vect_slp_is_lane_insert (def_stmt,
5707 						   NULL_TREE, &this_lane)
5708 		      || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5709 		    break;
5710 		  if (bitmap_bit_p (lanes, this_lane))
5711 		    break;
5712 		  lanes_found++;
5713 		  bitmap_set_bit (lanes, this_lane);
5714 		  lane_defs.quick_push (std::make_pair
5715 					  (this_lane,
5716 					   gimple_assign_rhs2 (def_stmt)));
5717 		  roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5718 		  def = gimple_assign_rhs1 (def_stmt);
5719 		}
5720 	      while (lanes_found < nlanes);
5721 	    }
5722 	  if (lanes_found == nlanes)
5723 	    {
5724 	      /* Sort lane_defs after the lane index and register the root.  */
5725 	      lane_defs.qsort (vld_cmp);
5726 	      vec<stmt_vec_info> stmts;
5727 	      stmts.create (nlanes);
5728 	      for (unsigned i = 0; i < nlanes; ++i)
5729 		stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5730 	      bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5731 						   stmts, roots));
5732 	    }
5733 	  else
5734 	    roots.release ();
5735 	}
5736       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5737 	       && (associative_tree_code (code) || code == MINUS_EXPR)
5738 	       /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
5739 		  checks pessimize a two-element reduction.  PR54400.
5740 		  ???  In-order reduction could be handled if we only
5741 		  traverse one operand chain in vect_slp_linearize_chain.  */
5742 	       && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5743 		   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5744 		       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5745 	       /* Ops with constants at the tail can be stripped here.  */
5746 	       && TREE_CODE (rhs) == SSA_NAME
5747 	       && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5748 	       /* Should be the chain end.  */
5749 	       && (!single_imm_use (gimple_assign_lhs (assign),
5750 				    &use_p, &use_stmt)
5751 		   || !is_gimple_assign (use_stmt)
5752 		   || (gimple_assign_rhs_code (use_stmt) != code
5753 		       && ((code != PLUS_EXPR && code != MINUS_EXPR)
5754 			   || (gimple_assign_rhs_code (use_stmt)
5755 			       != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5756 	{
5757 	  /* We start the match at the end of a possible association
5758 	     chain.  */
5759 	  auto_vec<chain_op_t> chain;
5760 	  auto_vec<std::pair<tree_code, gimple *> > worklist;
5761 	  auto_vec<gimple *> chain_stmts;
5762 	  gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5763 	  if (code == MINUS_EXPR)
5764 	    code = PLUS_EXPR;
5765 	  internal_fn reduc_fn;
5766 	  if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5767 	      || reduc_fn == IFN_LAST)
5768 	    continue;
5769 	  vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5770 				    /* ??? */
5771 				    code_stmt, alt_code_stmt, &chain_stmts);
5772 	  if (chain.length () > 1)
5773 	    {
5774 	      /* Sort the chain according to def_type and operation.  */
5775 	      chain.sort (dt_sort_cmp, bb_vinfo);
5776 	      /* ???  Now we'd want to strip externals and constants
5777 		 but record those to be handled in the epilogue.  */
5778 	      /* ???  For now do not allow mixing ops or externs/constants.  */
5779 	      bool invalid = false;
5780 	      for (unsigned i = 0; i < chain.length (); ++i)
5781 		if (chain[i].dt != vect_internal_def
5782 		    || chain[i].code != code)
5783 		  invalid = true;
5784 	      if (!invalid)
5785 		{
5786 		  vec<stmt_vec_info> stmts;
5787 		  stmts.create (chain.length ());
5788 		  for (unsigned i = 0; i < chain.length (); ++i)
5789 		    stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5790 		  vec<stmt_vec_info> roots;
5791 		  roots.create (chain_stmts.length ());
5792 		  for (unsigned i = 0; i < chain_stmts.length (); ++i)
5793 		    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5794 		  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5795 						       stmts, roots));
5796 		}
5797 	    }
5798 	}
5799     }
5800 }
5801 
5802 /* Walk the grouped store chains and replace entries with their
5803    pattern variant if any.  */
5804 
5805 static void
5806 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5807 {
5808   stmt_vec_info first_element;
5809   unsigned i;
5810 
5811   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5812     {
5813       /* We also have CTORs in this array.  */
5814       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5815 	continue;
5816       if (STMT_VINFO_IN_PATTERN_P (first_element))
5817 	{
5818 	  stmt_vec_info orig = first_element;
5819 	  first_element = STMT_VINFO_RELATED_STMT (first_element);
5820 	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5821 	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5822 	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5823 	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5824 	  vinfo->grouped_stores[i] = first_element;
5825 	}
5826       stmt_vec_info prev = first_element;
5827       while (DR_GROUP_NEXT_ELEMENT (prev))
5828 	{
5829 	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5830 	  if (STMT_VINFO_IN_PATTERN_P (elt))
5831 	    {
5832 	      stmt_vec_info orig = elt;
5833 	      elt = STMT_VINFO_RELATED_STMT (elt);
5834 	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
5835 	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5836 	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5837 	    }
5838 	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5839 	  prev = elt;
5840 	}
5841     }
5842 }
5843 
5844 /* Check if the region described by BB_VINFO can be vectorized, returning
5845    true if so.  When returning false, set FATAL to true if the same failure
5846    would prevent vectorization at other vector sizes, false if it is still
5847    worth trying other sizes.  N_STMTS is the number of statements in the
5848    region.  */
5849 
5850 static bool
5851 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5852 		       vec<int> *dataref_groups)
5853 {
5854   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5855 
5856   slp_instance instance;
5857   int i;
5858   poly_uint64 min_vf = 2;
5859 
5860   /* The first group of checks is independent of the vector size.  */
5861   fatal = true;
5862 
5863   /* Analyze the data references.  */
5864 
5865   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5866     {
5867       if (dump_enabled_p ())
5868         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5869 			 "not vectorized: unhandled data-ref in basic "
5870 			 "block.\n");
5871       return false;
5872     }
5873 
5874   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5875     {
5876      if (dump_enabled_p ())
5877        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5878 			"not vectorized: unhandled data access in "
5879 			"basic block.\n");
5880       return false;
5881     }
5882 
5883   vect_slp_check_for_constructors (bb_vinfo);
5884 
5885   /* If there are no grouped stores and no constructors in the region
5886      there is no need to continue with pattern recog as vect_analyze_slp
5887      will fail anyway.  */
5888   if (bb_vinfo->grouped_stores.is_empty ()
5889       && bb_vinfo->roots.is_empty ())
5890     {
5891       if (dump_enabled_p ())
5892 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5893 			 "not vectorized: no grouped stores in "
5894 			 "basic block.\n");
5895       return false;
5896     }
5897 
5898   /* The rest of the analysis below depends on the vector size in some way.  */
5899   fatal = false;
5900 
5901   vect_pattern_recog (bb_vinfo);
5902 
5903   /* Update store groups from pattern processing.  */
5904   vect_fixup_store_groups_with_patterns (bb_vinfo);
5905 
5906   /* Check the SLP opportunities in the basic block, analyze and build SLP
5907      trees.  */
5908   if (!vect_analyze_slp (bb_vinfo, n_stmts))
5909     {
5910       if (dump_enabled_p ())
5911 	{
5912 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5913 			   "Failed to SLP the basic block.\n");
5914 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5915 			   "not vectorized: failed to find SLP opportunities "
5916 			   "in basic block.\n");
5917 	}
5918       return false;
5919     }
5920 
5921   /* Optimize permutations.  */
5922   vect_optimize_slp (bb_vinfo);
5923 
5924   /* Gather the loads reachable from the SLP graph entries.  */
5925   vect_gather_slp_loads (bb_vinfo);
5926 
5927   vect_record_base_alignments (bb_vinfo);
5928 
5929   /* Analyze and verify the alignment of data references and the
5930      dependence in the SLP instances.  */
5931   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5932     {
5933       vect_location = instance->location ();
5934       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5935 	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5936 	{
5937 	  slp_tree node = SLP_INSTANCE_TREE (instance);
5938 	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5939 	  if (dump_enabled_p ())
5940 	    dump_printf_loc (MSG_NOTE, vect_location,
5941 			     "removing SLP instance operations starting from: %G",
5942 			     stmt_info->stmt);
5943 	  vect_free_slp_instance (instance);
5944 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5945 	  continue;
5946 	}
5947 
5948       /* Mark all the statements that we want to vectorize as pure SLP and
5949 	 relevant.  */
5950       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5951       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5952       unsigned j;
5953       stmt_vec_info root;
5954       /* Likewise consider instance root stmts as vectorized.  */
5955       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5956 	STMT_SLP_TYPE (root) = pure_slp;
5957 
5958       i++;
5959     }
5960   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5961     return false;
5962 
5963   if (!vect_slp_analyze_operations (bb_vinfo))
5964     {
5965       if (dump_enabled_p ())
5966         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5967 			 "not vectorized: bad operation in basic block.\n");
5968       return false;
5969     }
5970 
5971   vect_bb_partition_graph (bb_vinfo);
5972 
5973   return true;
5974 }
5975 
5976 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
5977    basic blocks in BBS, returning true on success.
5978    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
5979 
5980 static bool
5981 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5982 		 vec<int> *dataref_groups, unsigned int n_stmts,
5983 		 loop_p orig_loop)
5984 {
5985   bb_vec_info bb_vinfo;
5986   auto_vector_modes vector_modes;
5987 
5988   /* Autodetect first vector size we try.  */
5989   machine_mode next_vector_mode = VOIDmode;
5990   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5991   unsigned int mode_i = 0;
5992 
5993   vec_info_shared shared;
5994 
5995   machine_mode autodetected_vector_mode = VOIDmode;
5996   while (1)
5997     {
5998       bool vectorized = false;
5999       bool fatal = false;
6000       bb_vinfo = new _bb_vec_info (bbs, &shared);
6001 
6002       bool first_time_p = shared.datarefs.is_empty ();
6003       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
6004       if (first_time_p)
6005 	bb_vinfo->shared->save_datarefs ();
6006       else
6007 	bb_vinfo->shared->check_datarefs ();
6008       bb_vinfo->vector_mode = next_vector_mode;
6009 
6010       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
6011 	{
6012 	  if (dump_enabled_p ())
6013 	    {
6014 	      dump_printf_loc (MSG_NOTE, vect_location,
6015 			       "***** Analysis succeeded with vector mode"
6016 			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
6017 	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
6018 	    }
6019 
6020 	  bb_vinfo->shared->check_datarefs ();
6021 
6022 	  auto_vec<slp_instance> profitable_subgraphs;
6023 	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
6024 	    {
6025 	      if (instance->subgraph_entries.is_empty ())
6026 		continue;
6027 
6028 	      vect_location = instance->location ();
6029 	      if (!unlimited_cost_model (NULL)
6030 		  && !vect_bb_vectorization_profitable_p
6031 			(bb_vinfo, instance->subgraph_entries, orig_loop))
6032 		{
6033 		  if (dump_enabled_p ())
6034 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6035 				     "not vectorized: vectorization is not "
6036 				     "profitable.\n");
6037 		  continue;
6038 		}
6039 
6040 	      if (!dbg_cnt (vect_slp))
6041 		continue;
6042 
6043 	      profitable_subgraphs.safe_push (instance);
6044 	    }
6045 
6046 	  /* When we're vectorizing an if-converted loop body make sure
6047 	     we vectorized all if-converted code.  */
6048 	  if (!profitable_subgraphs.is_empty ()
6049 	      && orig_loop)
6050 	    {
6051 	      gcc_assert (bb_vinfo->bbs.length () == 1);
6052 	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6053 		   !gsi_end_p (gsi); gsi_next (&gsi))
6054 		{
6055 		  /* The costing above left us with DCEable vectorized scalar
6056 		     stmts having the visited flag set on profitable
6057 		     subgraphs.  Do the delayed clearing of the flag here.  */
6058 		  if (gimple_visited_p (gsi_stmt (gsi)))
6059 		    {
6060 		      gimple_set_visited (gsi_stmt (gsi), false);
6061 		      continue;
6062 		    }
6063 		  if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6064 		    continue;
6065 
6066 		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6067 		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
6068 		      {
6069 			if (!profitable_subgraphs.is_empty ()
6070 			    && dump_enabled_p ())
6071 			  dump_printf_loc (MSG_NOTE, vect_location,
6072 					   "not profitable because of "
6073 					   "unprofitable if-converted scalar "
6074 					   "code\n");
6075 			profitable_subgraphs.truncate (0);
6076 		      }
6077 		}
6078 	    }
6079 
6080 	  /* Finally schedule the profitable subgraphs.  */
6081 	  for (slp_instance instance : profitable_subgraphs)
6082 	    {
6083 	      if (!vectorized && dump_enabled_p ())
6084 		dump_printf_loc (MSG_NOTE, vect_location,
6085 				 "Basic block will be vectorized "
6086 				 "using SLP\n");
6087 	      vectorized = true;
6088 
6089 	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6090 
6091 	      unsigned HOST_WIDE_INT bytes;
6092 	      if (dump_enabled_p ())
6093 		{
6094 		  if (GET_MODE_SIZE
6095 			(bb_vinfo->vector_mode).is_constant (&bytes))
6096 		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6097 				     "basic block part vectorized using %wu "
6098 				     "byte vectors\n", bytes);
6099 		  else
6100 		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6101 				     "basic block part vectorized using "
6102 				     "variable length vectors\n");
6103 		}
6104 	    }
6105 	}
6106       else
6107 	{
6108 	  if (dump_enabled_p ())
6109 	    dump_printf_loc (MSG_NOTE, vect_location,
6110 			     "***** Analysis failed with vector mode %s\n",
6111 			     GET_MODE_NAME (bb_vinfo->vector_mode));
6112 	}
6113 
6114       if (mode_i == 0)
6115 	autodetected_vector_mode = bb_vinfo->vector_mode;
6116 
6117       if (!fatal)
6118 	while (mode_i < vector_modes.length ()
6119 	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6120 	  {
6121 	    if (dump_enabled_p ())
6122 	      dump_printf_loc (MSG_NOTE, vect_location,
6123 			       "***** The result for vector mode %s would"
6124 			       " be the same\n",
6125 			       GET_MODE_NAME (vector_modes[mode_i]));
6126 	    mode_i += 1;
6127 	  }
6128 
6129       delete bb_vinfo;
6130 
6131       if (mode_i < vector_modes.length ()
6132 	  && VECTOR_MODE_P (autodetected_vector_mode)
6133 	  && (related_vector_mode (vector_modes[mode_i],
6134 				   GET_MODE_INNER (autodetected_vector_mode))
6135 	      == autodetected_vector_mode)
6136 	  && (related_vector_mode (autodetected_vector_mode,
6137 				   GET_MODE_INNER (vector_modes[mode_i]))
6138 	      == vector_modes[mode_i]))
6139 	{
6140 	  if (dump_enabled_p ())
6141 	    dump_printf_loc (MSG_NOTE, vect_location,
6142 			     "***** Skipping vector mode %s, which would"
6143 			     " repeat the analysis for %s\n",
6144 			     GET_MODE_NAME (vector_modes[mode_i]),
6145 			     GET_MODE_NAME (autodetected_vector_mode));
6146 	  mode_i += 1;
6147 	}
6148 
6149       if (vectorized
6150 	  || mode_i == vector_modes.length ()
6151 	  || autodetected_vector_mode == VOIDmode
6152 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
6153 	     vector sizes will fail do not bother iterating.  */
6154 	  || fatal)
6155 	return vectorized;
6156 
6157       /* Try the next biggest vector size.  */
6158       next_vector_mode = vector_modes[mode_i++];
6159       if (dump_enabled_p ())
6160 	dump_printf_loc (MSG_NOTE, vect_location,
6161 			 "***** Re-trying analysis with vector mode %s\n",
6162 			 GET_MODE_NAME (next_vector_mode));
6163     }
6164 }
6165 
6166 
6167 /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
6168    true if anything in the basic-block was vectorized.  */
6169 
6170 static bool
6171 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6172 {
6173   vec<data_reference_p> datarefs = vNULL;
6174   auto_vec<int> dataref_groups;
6175   int insns = 0;
6176   int current_group = 0;
6177 
6178   for (unsigned i = 0; i < bbs.length (); i++)
6179     {
6180       basic_block bb = bbs[i];
6181       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6182 	   gsi_next (&gsi))
6183 	{
6184 	  gimple *stmt = gsi_stmt (gsi);
6185 	  if (is_gimple_debug (stmt))
6186 	    continue;
6187 
6188 	  insns++;
6189 
6190 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
6191 	    vect_location = stmt;
6192 
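	  /* A stmt for which no analyzable data reference gets recorded
	     effectively acts as a group separator: bumping CURRENT_GROUP
	     below keeps data-refs on either side of it in distinct DR
	     groups.  */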
6193 	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6194 					      &dataref_groups, current_group))
6195 	    ++current_group;
6196 	}
6197       /* New BBs always start a new DR group.  */
6198       ++current_group;
6199     }
6200 
6201   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6202 }
6203 
6204 /* Special entry for the BB vectorizer.  Analyze and transform a single
6205    if-converted BB with ORIG_LOOPs body being the not if-converted
6206    representation.  Returns true if anything in the basic-block was
6207    vectorized.  */
6208 
6209 bool
6210 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6211 {
6212   auto_vec<basic_block> bbs;
6213   bbs.safe_push (bb);
6214   return vect_slp_bbs (bbs, orig_loop);
6215 }
6216 
6217 /* Main entry for the BB vectorizer.  Analyze and transform the function FUN,
6218    returning true if anything in it was vectorized.  */
6219 
6220 bool
6221 vect_slp_function (function *fun)
6222 {
6223   bool r = false;
6224   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6225   unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6226 
6227   /* For the moment split the function into pieces to avoid making
6228      the iteration on the vector mode moot.  Split at points we know
6229      to not handle well which is CFG merges (SLP discovery doesn't
6230      handle non-loop-header PHIs) and loop exits.  Since pattern
6231      recog requires reverse iteration to visit uses before defs
6232      simply chop RPO into pieces.  */
6233   auto_vec<basic_block> bbs;
6234   for (unsigned i = 0; i < n; i++)
6235     {
6236       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6237       bool split = false;
6238 
6239       /* Split when a BB is not dominated by the first block.  */
6240       if (!bbs.is_empty ()
6241 	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6242 	{
6243 	  if (dump_enabled_p ())
6244 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 			     "splitting region at dominance boundary bb%d\n",
6246 			     bb->index);
6247 	  split = true;
6248 	}
6249       /* Split when the loop determined by the first block
6250 	 is exited.  This is because we eventually insert
6251 	 invariants at region begin.  */
6252       else if (!bbs.is_empty ()
6253 	       && bbs[0]->loop_father != bb->loop_father
6254 	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6255 	{
6256 	  if (dump_enabled_p ())
6257 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 			     "splitting region at loop %d exit at bb%d\n",
6259 			     bbs[0]->loop_father->num, bb->index);
6260 	  split = true;
6261 	}
6262 
6263       if (split && !bbs.is_empty ())
6264 	{
6265 	  r |= vect_slp_bbs (bbs, NULL);
6266 	  bbs.truncate (0);
6267 	}
6268 
6269       /* We need to be able to insert at the head of the region, which
6270 	 we cannot do for a region starting with a returns-twice call.  */
6271       if (bbs.is_empty ())
6272 	if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
6273 	  if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
6274 	    {
6275 	      if (dump_enabled_p ())
6276 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 				 "skipping bb%d as start of region as it "
6278 				 "starts with returns-twice call\n",
6279 				 bb->index);
6280 	      continue;
6281 	    }
6282 
6283       bbs.safe_push (bb);
6284 
6285       /* When the stmt ending this block defines a value, inserting a
6286 	 vector that contains this definition after the stmt would require
6287 	 inserting on outgoing edges.  Avoid this for now.  */
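      /* E.g. (an illustrative case): a call that both sets an lhs and can
	 throw internally ends its block with an EH edge, so a vector use
	 of that lhs right after it could only be realized by inserting on
	 an outgoing edge.  */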
6288       if (gimple *last = last_stmt (bb))
6289 	if (gimple_get_lhs (last)
6290 	    && is_ctrl_altering_stmt (last))
6291 	  {
6292 	    if (dump_enabled_p ())
6293 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6294 			       "splitting region at control altering "
6295 			       "definition %G", last);
6296 	    r |= vect_slp_bbs (bbs, NULL);
6297 	    bbs.truncate (0);
6298 	  }
6299     }
6300 
6301   if (!bbs.is_empty ())
6302     r |= vect_slp_bbs (bbs, NULL);
6303 
6304   free (rpo);
6305 
6306   return r;
6307 }
6308 
6309 /* Build a variable-length vector in which the elements in ELTS are repeated
6310    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
6311    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
6312 
6313    The approach we use is:
6314 
6315    (1) Find a vector mode VM with integer elements of mode IM.
6316 
6317    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6318        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
6319        from small vectors to IM.
6320 
6321    (3) Duplicate each ELTS'[I] into a vector of mode VM.
6322 
6323    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6324        correct byte contents.
6325 
6326    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6327 
6328    We try to find the largest IM for which this sequence works, in order
6329    to cut down on the number of interleaves.  */
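/* As an illustration (element values and mode sizes chosen arbitrarily, the
   real choice depends on the target): for ELTS = { a, b, c, d } with 16-bit
   elements and IM a 32-bit integer mode, step (2) packs { a, b } and { c, d }
   into two IM scalars, step (3) broadcasts them into two VM vectors whose
   byte contents are abababab... and cdcdcdcd..., a single interleaving
   VEC_PERM_EXPR in step (4) then yields abcdabcd..., and step (5)
   reinterprets that as VECTOR_TYPE.  */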
6330 
6331 void
6332 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6333 			  const vec<tree> &elts, unsigned int nresults,
6334 			  vec<tree> &results)
6335 {
6336   unsigned int nelts = elts.length ();
6337   tree element_type = TREE_TYPE (vector_type);
6338 
6339   /* (1) Find a vector mode VM with integer elements of mode IM.  */
6340   unsigned int nvectors = 1;
6341   tree new_vector_type;
6342   tree permutes[2];
6343   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6344 				       &nvectors, &new_vector_type,
6345 				       permutes))
6346     gcc_unreachable ();
6347 
6348   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
6349   unsigned int partial_nelts = nelts / nvectors;
6350   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6351 
6352   tree_vector_builder partial_elts;
6353   auto_vec<tree, 32> pieces (nvectors * 2);
6354   pieces.quick_grow_cleared (nvectors * 2);
6355   for (unsigned int i = 0; i < nvectors; ++i)
6356     {
6357       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6358 	     ELTS' has mode IM.  */
6359       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6360       for (unsigned int j = 0; j < partial_nelts; ++j)
6361 	partial_elts.quick_push (elts[i * partial_nelts + j]);
6362       tree t = gimple_build_vector (seq, &partial_elts);
6363       t = gimple_build (seq, VIEW_CONVERT_EXPR,
6364 			TREE_TYPE (new_vector_type), t);
6365 
6366       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
6367       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6368     }
6369 
6370   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6371 	 correct byte contents.
6372 
6373      Conceptually, we need to repeat the following operation log2(nvectors)
6374      times, where hi_start = nvectors / 2:
6375 
6376 	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6377 	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6378 
6379      However, if each input repeats every N elements and the VF is
6380      a multiple of N * 2, the HI result is the same as the LO result.
6381      This will be true for the first N1 iterations of the outer loop,
6382      followed by N2 iterations for which both the LO and HI results
6383      are needed.  I.e.:
6384 
6385 	N1 + N2 = log2(nvectors)
6386 
6387      Each "N1 iteration" doubles the number of redundant vectors and the
6388      effect of the process as a whole is to have a sequence of nvectors/2**N1
6389      vectors that repeats 2**N1 times.  Rather than generate these redundant
6390      vectors, we halve the number of vectors for each N1 iteration.  */
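  /* For instance (a hypothetical variable-length case): with nvectors == 4
     and a vector known to hold an even, but not necessarily multiple-of-four,
     number of IM elements, the first iteration halves 4 inputs to 2 outputs
     (an N1 iteration, HI == LO), while the second keeps both LO and HI
     (an N2 iteration), leaving 2 final vectors whose combined contents
     repeat 2**N1 == 2 times.  */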
6391   unsigned int in_start = 0;
6392   unsigned int out_start = nvectors;
6393   unsigned int new_nvectors = nvectors;
6394   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6395     {
6396       unsigned int hi_start = new_nvectors / 2;
6397       unsigned int out_i = 0;
6398       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6399 	{
6400 	  if ((in_i & 1) != 0
6401 	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6402 			     2 * in_repeat))
6403 	    continue;
6404 
6405 	  tree output = make_ssa_name (new_vector_type);
6406 	  tree input1 = pieces[in_start + (in_i / 2)];
6407 	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6408 	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6409 					       input1, input2,
6410 					       permutes[in_i & 1]);
6411 	  gimple_seq_add_stmt (seq, stmt);
6412 	  pieces[out_start + out_i] = output;
6413 	  out_i += 1;
6414 	}
6415       std::swap (in_start, out_start);
6416       new_nvectors = out_i;
6417     }
6418 
6419   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
6420   results.reserve (nresults);
6421   for (unsigned int i = 0; i < nresults; ++i)
6422     if (i < new_nvectors)
6423       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6424 					pieces[in_start + i]));
6425     else
6426       results.quick_push (results[i - new_nvectors]);
6427 }
6428 
6429 
6430 /* For constant and loop invariant defs in OP_NODE this function creates
6431    vector defs that will be used in the vectorized stmts and stores them
6432    to SLP_TREE_VEC_DEFS of OP_NODE.  */
6433 
6434 static void
6435 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6436 {
6437   unsigned HOST_WIDE_INT nunits;
6438   tree vec_cst;
6439   unsigned j, number_of_places_left_in_vector;
6440   tree vector_type;
6441   tree vop;
6442   int group_size = op_node->ops.length ();
6443   unsigned int vec_num, i;
6444   unsigned number_of_copies = 1;
6445   bool constant_p;
6446   gimple_seq ctor_seq = NULL;
6447   auto_vec<tree, 16> permute_results;
6448 
6449   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
6450   vector_type = SLP_TREE_VECTYPE (op_node);
6451 
6452   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6453   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6454   auto_vec<tree> voprnds (number_of_vectors);
6455 
6456   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6457      created vectors. It is greater than 1 if unrolling is performed.
6458 
6459      For example, we have two scalar operands, s1 and s2 (e.g., group of
6460      strided accesses of size two), while NUNITS is four (i.e., four scalars
6461      of this type can be packed in a vector).  The output vector will contain
6462      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
6463      will be 2).
6464 
6465      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6466      containing the operands.
6467 
6468      For example, NUNITS is four as before, and the group size is 8
6469      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
6470      {s5, s6, s7, s8}.  */
6471 
6472   /* When using duplicate_and_interleave, we just need one element for
6473      each scalar statement.  */
6474   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6475     nunits = group_size;
6476 
6477   number_of_copies = nunits * number_of_vectors / group_size;
6478 
6479   number_of_places_left_in_vector = nunits;
6480   constant_p = true;
6481   tree_vector_builder elts (vector_type, nunits, 1);
6482   elts.quick_grow (nunits);
6483   stmt_vec_info insert_after = NULL;
6484   for (j = 0; j < number_of_copies; j++)
6485     {
6486       tree op;
6487       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6488         {
6489           /* Create 'vect_ = {op0,op1,...,opn}'.  */
6490           number_of_places_left_in_vector--;
6491 	  tree orig_op = op;
6492 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6493 	    {
6494 	      if (CONSTANT_CLASS_P (op))
6495 		{
6496 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6497 		    {
6498 		      /* Can't use VIEW_CONVERT_EXPR for booleans because
6499 			 of possibly different sizes of scalar value and
6500 			 vector element.  */
6501 		      if (integer_zerop (op))
6502 			op = build_int_cst (TREE_TYPE (vector_type), 0);
6503 		      else if (integer_onep (op))
6504 			op = build_all_ones_cst (TREE_TYPE (vector_type));
6505 		      else
6506 			gcc_unreachable ();
6507 		    }
6508 		  else
6509 		    op = fold_unary (VIEW_CONVERT_EXPR,
6510 				     TREE_TYPE (vector_type), op);
6511 		  gcc_assert (op && CONSTANT_CLASS_P (op));
6512 		}
6513 	      else
6514 		{
6515 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6516 		  gimple *init_stmt;
6517 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6518 		    {
6519 		      tree true_val
6520 			= build_all_ones_cst (TREE_TYPE (vector_type));
6521 		      tree false_val
6522 			= build_zero_cst (TREE_TYPE (vector_type));
6523 		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6524 		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6525 						       op, true_val,
6526 						       false_val);
6527 		    }
6528 		  else
6529 		    {
6530 		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6531 				   op);
6532 		      init_stmt
6533 			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6534 					       op);
6535 		    }
6536 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
6537 		  op = new_temp;
6538 		}
6539 	    }
6540 	  elts[number_of_places_left_in_vector] = op;
6541 	  if (!CONSTANT_CLASS_P (op))
6542 	    constant_p = false;
6543 	  /* For BB vectorization we have to compute an insert location
6544 	     when a def is inside the analyzed region since we cannot
6545 	     simply insert at the BB start in this case.  */
6546 	  stmt_vec_info opdef;
6547 	  if (TREE_CODE (orig_op) == SSA_NAME
6548 	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6549 	      && is_a <bb_vec_info> (vinfo)
6550 	      && (opdef = vinfo->lookup_def (orig_op)))
6551 	    {
6552 	      if (!insert_after)
6553 		insert_after = opdef;
6554 	      else
6555 		insert_after = get_later_stmt (insert_after, opdef);
6556 	    }
6557 
6558           if (number_of_places_left_in_vector == 0)
6559             {
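	      /* A VECTOR_CST can encode NUNITS constant elements repeated to
		 fill a longer (possibly variable-length) vector, so for
		 all-constant elements it is enough that the subparts are a
		 multiple of NUNITS; with SSA names in the mix we can only
		 build the vector directly for an exact match and otherwise
		 go through duplicate_and_interleave.  */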
6560 	      if (constant_p
6561 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6562 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6563 		vec_cst = gimple_build_vector (&ctor_seq, &elts);
6564 	      else
6565 		{
6566 		  if (permute_results.is_empty ())
6567 		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6568 					      elts, number_of_vectors,
6569 					      permute_results);
6570 		  vec_cst = permute_results[number_of_vectors - j - 1];
6571 		}
6572 	      if (!gimple_seq_empty_p (ctor_seq))
6573 		{
6574 		  if (insert_after)
6575 		    {
6576 		      gimple_stmt_iterator gsi;
6577 		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6578 			{
6579 			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6580 			  gsi_insert_seq_before (&gsi, ctor_seq,
6581 						 GSI_CONTINUE_LINKING);
6582 			}
6583 		      else if (!stmt_ends_bb_p (insert_after->stmt))
6584 			{
6585 			  gsi = gsi_for_stmt (insert_after->stmt);
6586 			  gsi_insert_seq_after (&gsi, ctor_seq,
6587 						GSI_CONTINUE_LINKING);
6588 			}
6589 		      else
6590 			{
6591 			  /* When we want to insert after a def whose
6592 			     defining stmt throws, insert on the fallthru
6593 			     edge instead.  */
6594 			  edge e = find_fallthru_edge
6595 				     (gimple_bb (insert_after->stmt)->succs);
6596 			  basic_block new_bb
6597 			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6598 			  gcc_assert (!new_bb);
6599 			}
6600 		    }
6601 		  else
6602 		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
6603 		  ctor_seq = NULL;
6604 		}
6605 	      voprnds.quick_push (vec_cst);
6606 	      insert_after = NULL;
6607               number_of_places_left_in_vector = nunits;
6608 	      constant_p = true;
6609 	      elts.new_vector (vector_type, nunits, 1);
6610 	      elts.quick_grow (nunits);
6611             }
6612         }
6613     }
6614 
6615   /* Since the vectors are created in the reverse order, we should invert
6616      them.  */
6617   vec_num = voprnds.length ();
6618   for (j = vec_num; j != 0; j--)
6619     {
6620       vop = voprnds[j - 1];
6621       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6622     }
6623 
6624   /* In case that VF is greater than the unrolling factor needed for the SLP
6625      group of stmts, NUMBER_OF_VECTORS to be created is greater than
6626      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6627      to replicate the vectors.  */
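  /* E.g., if only one distinct vector def was built above but two are
     required, the same def is simply pushed a second time.  */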
6628   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6629     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6630 	 i++)
6631       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6632 }
6633 
6634 /* Get the Ith vectorized definition from SLP_NODE.  */
6635 
6636 tree
6637 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6638 {
6639   if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6640     return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6641   else
6642     return SLP_TREE_VEC_DEFS (slp_node)[i];
6643 }
6644 
6645 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
6646 
6647 void
6648 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6649 {
6650   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6651   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6652     {
6653       unsigned j;
6654       gimple *vec_def_stmt;
6655       FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6656 	vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6657     }
6658   else
6659     vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6660 }
6661 
6662 /* Get N vectorized definitions for SLP_NODE.  */
6663 
6664 void
6665 vect_get_slp_defs (vec_info *,
6666 		   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6667 {
6668   if (n == -1U)
6669     n = SLP_TREE_CHILDREN (slp_node).length ();
6670 
6671   for (unsigned i = 0; i < n; ++i)
6672     {
6673       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6674       vec<tree> vec_defs = vNULL;
6675       vect_get_slp_defs (child, &vec_defs);
6676       vec_oprnds->quick_push (vec_defs);
6677     }
6678 }
6679 
6680 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6681    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6682    permute statements for the SLP node NODE.  Store the number of vector
6683    permute instructions in *N_PERMS and the number of vector load
6684    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
6685    that were not needed.  */
6686 
6687 bool
6688 vect_transform_slp_perm_load (vec_info *vinfo,
6689 			      slp_tree node, const vec<tree> &dr_chain,
6690 			      gimple_stmt_iterator *gsi, poly_uint64 vf,
6691 			      bool analyze_only, unsigned *n_perms,
6692 			      unsigned int *n_loads, bool dce_chain)
6693 {
6694   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6695   int vec_index = 0;
6696   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6697   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6698   unsigned int mask_element;
6699   machine_mode mode;
6700 
6701   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6702     return false;
6703 
6704   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6705 
6706   mode = TYPE_MODE (vectype);
6707   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6708 
6709   /* Initialize the vect stmts of NODE to properly insert the generated
6710      stmts later.  */
6711   if (! analyze_only)
6712     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6713 	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6714       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6715 
6716   /* Generate permutation masks for every NODE. Number of masks for each NODE
6717      is equal to GROUP_SIZE.
6718      E.g., we have a group of three nodes with three loads from the same
6719      location in each node, and the vector size is 4. I.e., we have a
6720      location in each node, and the vector size is 4. I.e., we have an
6721      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6722      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6723      ...
6724 
6725      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6726      The last mask is illegal since we assume two operands for a permute
6727      operation, and the mask element values can't be outside that range.
6728      Hence, the last mask must be converted into {2,5,5,5}.
6729      For the first two permutations we need the first and the second input
6730      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6731      we need the second and the third vectors: {b1,c1,a2,b2} and
6732      {c2,a3,b3,c3}.  */
6733 
6734   int vect_stmts_counter = 0;
6735   unsigned int index = 0;
6736   int first_vec_index = -1;
6737   int second_vec_index = -1;
6738   bool noop_p = true;
6739   *n_perms = 0;
6740 
6741   vec_perm_builder mask;
6742   unsigned int nelts_to_build;
6743   unsigned int nvectors_per_build;
6744   unsigned int in_nlanes;
6745   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6746 		      && multiple_p (nunits, group_size));
6747   if (repeating_p)
6748     {
6749       /* A single vector contains a whole number of copies of the node, so:
6750 	 (a) all permutes can use the same mask; and
6751 	 (b) the permutes only need a single vector input.  */
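      /* E.g. (made-up numbers): for a group of size 2 with load permutation
	 { 1, 0 } the encoded mask is { 1, 0, 3, 2, 5, 4 }, which the stepped
	 encoding extends to { 1, 0, 3, 2, 5, 4, 7, 6, ... } for however many
	 lanes the vector provides.  */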
6752       mask.new_vector (nunits, group_size, 3);
6753       nelts_to_build = mask.encoded_nelts ();
6754       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6755       in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6756     }
6757   else
6758     {
6759       /* We need to construct a separate mask for each vector statement.  */
6760       unsigned HOST_WIDE_INT const_nunits, const_vf;
6761       if (!nunits.is_constant (&const_nunits)
6762 	  || !vf.is_constant (&const_vf))
6763 	return false;
6764       mask.new_vector (const_nunits, const_nunits, 1);
6765       nelts_to_build = const_vf * group_size;
6766       nvectors_per_build = 1;
6767       in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6768     }
6769   auto_sbitmap used_in_lanes (in_nlanes);
6770   bitmap_clear (used_in_lanes);
6771   auto_bitmap used_defs;
6772 
6773   unsigned int count = mask.encoded_nelts ();
6774   mask.quick_grow (count);
6775   vec_perm_indices indices;
6776 
6777   for (unsigned int j = 0; j < nelts_to_build; j++)
6778     {
6779       unsigned int iter_num = j / group_size;
6780       unsigned int stmt_num = j % group_size;
6781       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6782 			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6783       bitmap_set_bit (used_in_lanes, i);
6784       if (repeating_p)
6785 	{
6786 	  first_vec_index = 0;
6787 	  mask_element = i;
6788 	}
6789       else
6790 	{
6791 	  /* Enforced before the loop when !repeating_p.  */
6792 	  unsigned int const_nunits = nunits.to_constant ();
6793 	  vec_index = i / const_nunits;
6794 	  mask_element = i % const_nunits;
6795 	  if (vec_index == first_vec_index
6796 	      || first_vec_index == -1)
6797 	    {
6798 	      first_vec_index = vec_index;
6799 	    }
6800 	  else if (vec_index == second_vec_index
6801 		   || second_vec_index == -1)
6802 	    {
6803 	      second_vec_index = vec_index;
6804 	      mask_element += const_nunits;
6805 	    }
6806 	  else
6807 	    {
6808 	      if (dump_enabled_p ())
6809 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 				 "permutation requires at "
6811 				 "least three vectors %G",
6812 				 stmt_info->stmt);
6813 	      gcc_assert (analyze_only);
6814 	      return false;
6815 	    }
6816 
6817 	  gcc_assert (mask_element < 2 * const_nunits);
6818 	}
6819 
6820       if (mask_element != index)
6821 	noop_p = false;
6822       mask[index++] = mask_element;
6823 
6824       if (index == count && !noop_p)
6825 	{
6826 	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6827 	  if (!can_vec_perm_const_p (mode, indices))
6828 	    {
6829 	      if (dump_enabled_p ())
6830 		{
6831 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6832 				   vect_location,
6833 				   "unsupported vect permute { ");
6834 		  for (i = 0; i < count; ++i)
6835 		    {
6836 		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6837 		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6838 		    }
6839 		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6840 		}
6841 	      gcc_assert (analyze_only);
6842 	      return false;
6843 	    }
6844 
6845 	  ++*n_perms;
6846 	}
6847 
6848       if (index == count)
6849 	{
6850 	  if (!analyze_only)
6851 	    {
6852 	      tree mask_vec = NULL_TREE;
6853 
6854 	      if (! noop_p)
6855 		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6856 
6857 	      if (second_vec_index == -1)
6858 		second_vec_index = first_vec_index;
6859 
6860 	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6861 		{
6862 		  /* Generate the permute statement if necessary.  */
6863 		  tree first_vec = dr_chain[first_vec_index + ri];
6864 		  tree second_vec = dr_chain[second_vec_index + ri];
6865 		  gimple *perm_stmt;
6866 		  if (! noop_p)
6867 		    {
6868 		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6869 		      tree perm_dest
6870 			= vect_create_destination_var (gimple_assign_lhs (stmt),
6871 						       vectype);
6872 		      perm_dest = make_ssa_name (perm_dest);
6873 		      perm_stmt
6874 			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6875 					       first_vec, second_vec,
6876 					       mask_vec);
6877 		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6878 						   gsi);
6879 		      if (dce_chain)
6880 			{
6881 			  bitmap_set_bit (used_defs, first_vec_index + ri);
6882 			  bitmap_set_bit (used_defs, second_vec_index + ri);
6883 			}
6884 		    }
6885 		  else
6886 		    {
6887 		      /* If mask was NULL_TREE generate the requested
6888 			 identity transform.  */
6889 		      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6890 		      if (dce_chain)
6891 			bitmap_set_bit (used_defs, first_vec_index + ri);
6892 		    }
6893 
6894 		  /* Store the vector statement in NODE.  */
6895 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6896 		}
6897 	    }
6898 
6899 	  index = 0;
6900 	  first_vec_index = -1;
6901 	  second_vec_index = -1;
6902 	  noop_p = true;
6903 	}
6904     }
6905 
6906   if (n_loads)
6907     {
6908       if (repeating_p)
6909 	*n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6910       else
6911 	{
6912 	  /* Enforced above when !repeating_p.  */
6913 	  unsigned int const_nunits = nunits.to_constant ();
6914 	  *n_loads = 0;
6915 	  bool load_seen = false;
6916 	  for (unsigned i = 0; i < in_nlanes; ++i)
6917 	    {
6918 	      if (i % const_nunits == 0)
6919 		{
6920 		  if (load_seen)
6921 		    *n_loads += 1;
6922 		  load_seen = false;
6923 		}
6924 	      if (bitmap_bit_p (used_in_lanes, i))
6925 		load_seen = true;
6926 	    }
6927 	  if (load_seen)
6928 	    *n_loads += 1;
6929 	}
6930     }
6931 
6932   if (dce_chain)
6933     for (unsigned i = 0; i < dr_chain.length (); ++i)
6934       if (!bitmap_bit_p (used_defs, i))
6935 	{
6936 	  gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6937 	  gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6938 	  gsi_remove (&rgsi, true);
6939 	  release_defs (stmt);
6940 	}
6941 
6942   return true;
6943 }
6944 
6945 /* Produce the next vector result for SLP permutation NODE by adding a vector
6946    statement at GSI.  If MASK_VEC is nonnull, add:
6947 
6948       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6949 
6950    otherwise add:
6951 
6952       <new SSA name> = FIRST_DEF.  */
6953 
6954 static void
6955 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6956 			  slp_tree node, tree first_def, tree second_def,
6957 			  tree mask_vec)
6958 {
6959   tree vectype = SLP_TREE_VECTYPE (node);
6960 
6961   /* ???  We SLP match existing vector element extracts but
6962      allow punning which we need to re-instantiate at uses
6963      but have no good way of explicitly representing.  */
6964   if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6965     {
6966       gassign *conv_stmt
6967 	= gimple_build_assign (make_ssa_name (vectype),
6968 			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6969       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6970       first_def = gimple_assign_lhs (conv_stmt);
6971     }
6972   gassign *perm_stmt;
6973   tree perm_dest = make_ssa_name (vectype);
6974   if (mask_vec)
6975     {
6976       if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6977 	{
6978 	  gassign *conv_stmt
6979 	    = gimple_build_assign (make_ssa_name (vectype),
6980 				   build1 (VIEW_CONVERT_EXPR,
6981 					   vectype, second_def));
6982 	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6983 	  second_def = gimple_assign_lhs (conv_stmt);
6984 	}
6985       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6986 				       first_def, second_def,
6987 				       mask_vec);
6988     }
6989   else
6990     /* We need a copy here in case the def was external.  */
6991     perm_stmt = gimple_build_assign (perm_dest, first_def);
6992   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6993   /* Store the vector statement in NODE.  */
6994   SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6995 }
6996 
6997 /* Vectorize the SLP permutations in NODE as specified
6998    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6999    child number and lane number.
7000    Interleaving of two two-lane two-child SLP subtrees (not supported):
7001      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
7002    A blend of two four-lane two-child SLP subtrees:
7003      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
7004    Highpart of a four-lane one-child SLP subtree (not supported):
7005      [ { 0, 2 }, { 0, 3 } ]
7006    Where currently only a subset is supported by code generating below.  */
7007 
7008 static bool
7009 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
7010 			      slp_tree node, stmt_vector_for_cost *cost_vec)
7011 {
7012   tree vectype = SLP_TREE_VECTYPE (node);
7013 
7014   /* ???  We currently only support all same vector input and output types
7015      while the SLP IL should really do a concat + select and thus accept
7016      arbitrary mismatches.  */
7017   slp_tree child;
7018   unsigned i;
7019   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7020   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
7021   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7022     {
7023       if (!vect_maybe_update_slp_op_vectype (child, vectype)
7024 	  || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
7025 	{
7026 	  if (dump_enabled_p ())
7027 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7028 			     "Unsupported lane permutation\n");
7029 	  return false;
7030 	}
7031       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
7032 	repeating_p = false;
7033     }
7034 
7035   vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
7036   gcc_assert (perm.length () == SLP_TREE_LANES (node));
7037   if (dump_enabled_p ())
7038     {
7039       dump_printf_loc (MSG_NOTE, vect_location,
7040 		       "vectorizing permutation");
7041       for (unsigned i = 0; i < perm.length (); ++i)
7042 	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
7043       if (repeating_p)
7044 	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
7045       dump_printf (MSG_NOTE, "\n");
7046     }
7047 
7048   /* REPEATING_P is true if every output vector is guaranteed to use the
7049      same permute vector.  We can handle that case for both variable-length
7050      and constant-length vectors, but we only handle other cases for
7051      constant-length vectors.
7052 
7053      Set:
7054 
7055      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7056        mask vector that we want to build.
7057 
7058      - NCOPIES to the number of copies of PERM that we need in order
7059        to build the necessary permute mask vectors.
7060 
7061      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7062        for each permute mask vector.  This is only relevant when GSI is
7063        nonnull.  */
7064   uint64_t npatterns;
7065   unsigned nelts_per_pattern;
7066   uint64_t ncopies;
7067   unsigned noutputs_per_mask;
7068   if (repeating_p)
7069     {
7070       /* We need a single permute mask vector that has the form:
7071 
7072 	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7073 
7074 	 In other words, the original n-element permute in PERM is
7075 	 "unrolled" to fill a full vector.  The stepped vector encoding
7076 	 that we use for permutes requires 3n elements.  */
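      /* For instance (a made-up two-lane blend [ { 0, 0 }, { 1, 1 } ]):
	 with n == 2 the encoded mask is { 0, N + 1, 2, N + 3, 4, N + 5 },
	 N being the number of units in the input vectors, i.e. X1 == 0
	 and X2 == N + 1 in the form above.  */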
7077       npatterns = SLP_TREE_LANES (node);
7078       nelts_per_pattern = ncopies = 3;
7079       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7080     }
7081   else
7082     {
7083       /* Calculate every element of every permute mask vector explicitly,
7084 	 instead of relying on the pattern described above.  */
7085       if (!nunits.is_constant (&npatterns))
7086 	return false;
7087       nelts_per_pattern = ncopies = 1;
7088       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
7089 	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
7090 	  return false;
7091       noutputs_per_mask = 1;
7092     }
7093   unsigned olanes = ncopies * SLP_TREE_LANES (node);
7094   gcc_assert (repeating_p || multiple_p (olanes, nunits));
7095 
7096   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7097      from the { SLP operand, scalar lane } permutation as recorded in the
7098      SLP node as intermediate step.  This part should already work
7099      with SLP children with arbitrary number of lanes.  */
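  /* For example, a made-up two-lane blend [ { 0, 0 }, { 1, 1 } ] of two
     two-lane children with repeating_p and ncopies == 3 yields the sequence
       { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 }
       { { 0, 0 }, 4 } { { 1, 0 }, 5 }
     since each copy advances the active lane of every child by its number
     of lanes.  */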
7100   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
7101   auto_vec<unsigned> active_lane;
7102   vperm.create (olanes);
7103   active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
7104   for (unsigned i = 0; i < ncopies; ++i)
7105     {
7106       for (unsigned pi = 0; pi < perm.length (); ++pi)
7107 	{
7108 	  std::pair<unsigned, unsigned> p = perm[pi];
7109 	  tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7110 	  if (repeating_p)
7111 	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7112 	  else
7113 	    {
7114 	      /* We checked above that the vectors are constant-length.  */
7115 	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7116 	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7117 	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7118 	      vperm.quick_push ({{p.first, vi}, vl});
7119 	    }
7120 	}
7121       /* Advance to the next group.  */
7122       for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7123 	active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7124     }
7125 
7126   if (dump_enabled_p ())
7127     {
7128       dump_printf_loc (MSG_NOTE, vect_location, "as");
7129       for (unsigned i = 0; i < vperm.length (); ++i)
7130 	{
7131 	  if (i != 0
7132 	      && (repeating_p
7133 		  ? multiple_p (i, npatterns)
7134 		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7135 	    dump_printf (MSG_NOTE, ",");
7136 	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7137 		       vperm[i].first.first, vperm[i].first.second,
7138 		       vperm[i].second);
7139 	}
7140       dump_printf (MSG_NOTE, "\n");
7141     }
7142 
7143   /* We can only handle two-vector permutes, everything else should
7144      be lowered on the SLP level.  The following is closely inspired
7145      by vect_transform_slp_perm_load and is supposed to eventually
7146      replace it.
7147      ???   As intermediate step do code-gen in the SLP tree representation
7148      somehow?  */
7149   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7150   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7151   unsigned int index = 0;
7152   poly_uint64 mask_element;
7153   vec_perm_builder mask;
7154   mask.new_vector (nunits, npatterns, nelts_per_pattern);
7155   unsigned int count = mask.encoded_nelts ();
7156   mask.quick_grow (count);
7157   vec_perm_indices indices;
7158   unsigned nperms = 0;
7159   for (unsigned i = 0; i < vperm.length (); ++i)
7160     {
7161       mask_element = vperm[i].second;
7162       if (first_vec.first == -1U
7163 	  || first_vec == vperm[i].first)
7164 	first_vec = vperm[i].first;
7165       else if (second_vec.first == -1U
7166 	       || second_vec == vperm[i].first)
7167 	{
7168 	  second_vec = vperm[i].first;
7169 	  mask_element += nunits;
7170 	}
7171       else
7172 	{
7173 	  if (dump_enabled_p ())
7174 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7175 			     "permutation requires at "
7176 			     "least three vectors\n");
7177 	  gcc_assert (!gsi);
7178 	  return false;
7179 	}
7180 
7181       mask[index++] = mask_element;
7182 
7183       if (index == count)
7184 	{
7185 	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7186 	  bool identity_p = indices.series_p (0, 1, 0, 1);
7187 	  if (!identity_p
7188 	      && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7189 	    {
7190 	      if (dump_enabled_p ())
7191 		{
7192 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7193 				   vect_location,
7194 				   "unsupported vect permute { ");
7195 		  for (i = 0; i < count; ++i)
7196 		    {
7197 		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7198 		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7199 		    }
7200 		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7201 		}
7202 	      gcc_assert (!gsi);
7203 	      return false;
7204 	    }
7205 
7206 	  if (!identity_p)
7207 	    nperms++;
7208 	  if (gsi)
7209 	    {
7210 	      if (second_vec.first == -1U)
7211 		second_vec = first_vec;
7212 
7213 	      slp_tree
7214 		first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7215 		second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7216 
7217 	      tree mask_vec = NULL_TREE;
7218 	      if (!identity_p)
7219 		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7220 
7221 	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7222 		{
7223 		  tree first_def
7224 		    = vect_get_slp_vect_def (first_node,
7225 					     first_vec.second + vi);
7226 		  tree second_def
7227 		    = vect_get_slp_vect_def (second_node,
7228 					     second_vec.second + vi);
7229 		  vect_add_slp_permutation (vinfo, gsi, node, first_def,
7230 					    second_def, mask_vec);
7231 		}
7232 	    }
7233 
7234 	  index = 0;
7235 	  first_vec = std::make_pair (-1U, -1U);
7236 	  second_vec = std::make_pair (-1U, -1U);
7237 	}
7238     }
7239 
7240   if (!gsi)
7241     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7242 
7243   return true;
7244 }
7245 
7246 /* Vectorize SLP NODE.  */
7247 
7248 static void
7249 vect_schedule_slp_node (vec_info *vinfo,
7250 			slp_tree node, slp_instance instance)
7251 {
7252   gimple_stmt_iterator si;
7253   int i;
7254   slp_tree child;
7255 
7256   /* Vectorize externals and constants.  */
7257   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7258       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7259     {
7260       /* ???  vectorizable_shift can end up using a scalar operand which is
7261 	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
7262 	 node in this case.  */
7263       if (!SLP_TREE_VECTYPE (node))
7264 	return;
7265 
7266       /* There are two reasons vector defs might already exist.  The first
7267 	 is that we are vectorizing an existing vector def.  The second is
7268 	 when performing BB vectorization shared constant/external nodes
7269 	 are not split apart during partitioning so during the code-gen
7270 	 DFS walk we can end up visiting them twice.  */
7271       if (! SLP_TREE_VEC_DEFS (node).exists ())
7272 	vect_create_constant_vectors (vinfo, node);
7273       return;
7274     }
7275 
7276   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
7277 
7278   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7279 
7280   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7281   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7282 
7283   if (dump_enabled_p ())
7284     dump_printf_loc (MSG_NOTE, vect_location,
7285 		     "------>vectorizing SLP node starting from: %G",
7286 		     stmt_info->stmt);
7287 
7288   if (STMT_VINFO_DATA_REF (stmt_info)
7289       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7290     {
7291       /* Vectorized loads go before the first scalar load to make it
7292 	 ready early, vectorized stores go before the last scalar
7293 	 stmt which is where all uses are ready.  */
7294       stmt_vec_info last_stmt_info = NULL;
7295       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7296 	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7297       else /* DR_IS_WRITE */
7298 	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7299       si = gsi_for_stmt (last_stmt_info->stmt);
7300     }
7301   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7302 	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7303 	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7304 	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7305     {
7306       /* For PHI node vectorization we do not use the insertion iterator.  */
7307       si = gsi_none ();
7308     }
7309   else
7310     {
7311       /* Emit other stmts after the children vectorized defs which is
7312 	 earliest possible.  */
7313       gimple *last_stmt = NULL;
7314       if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
7315 	if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7316 	    || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7317 	  {
7318 	    /* But avoid scheduling internal defs outside of the loop when
7319 	       we might have only implicitly tracked loop mask/len defs.  */
7320 	    gimple_stmt_iterator si
7321 	      = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
7322 	    last_stmt = gsi_stmt (si);
7323 	  }
7324       bool seen_vector_def = false;
7325       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7326 	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7327 	  {
7328 	    /* For fold-left reductions we are retaining the scalar
7329 	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
7330 	       set so the representation isn't perfect.  Resort to the
7331 	       last scalar def here.  */
7332 	    if (SLP_TREE_VEC_STMTS (child).is_empty ())
7333 	      {
7334 		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7335 			    == cycle_phi_info_type);
7336 		gphi *phi = as_a <gphi *>
7337 			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7338 		if (!last_stmt
7339 		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
7340 		  last_stmt = phi;
7341 	      }
7342 	    /* We are emitting all vectorized stmts of a child in the same
7343 	       place, so the last one pushed is the latest one.
7344 	       ???  Unless we have a load permutation applied and that
7345 	       figures to re-use an earlier generated load.  */
7346 	    unsigned j;
7347 	    gimple *vstmt;
7348 	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7349 	      if (!last_stmt
7350 		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7351 		last_stmt = vstmt;
7352 	  }
7353 	else if (!SLP_TREE_VECTYPE (child))
7354 	  {
7355 	    /* For externals used unvectorized we look at all scalar defs.  */
7356 	    unsigned j;
7357 	    tree def;
7358 	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7359 	      if (TREE_CODE (def) == SSA_NAME
7360 		  && !SSA_NAME_IS_DEFAULT_DEF (def))
7361 		{
7362 		  gimple *stmt = SSA_NAME_DEF_STMT (def);
7363 		  if (!last_stmt
7364 		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7365 		    last_stmt = stmt;
7366 		}
7367 	  }
7368 	else
7369 	  {
7370 	    /* For externals we have to look at all defs since their
7371 	       insertion place is decided per vector.  But beware
7372 	       of pre-existing vectors where we need to make sure
7373 	       we do not insert before the region boundary.  */
7374 	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7375 		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7376 	      seen_vector_def = true;
7377 	    else
7378 	      {
7379 		unsigned j;
7380 		tree vdef;
7381 		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7382 		  if (TREE_CODE (vdef) == SSA_NAME
7383 		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7384 		    {
7385 		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7386 		      if (!last_stmt
7387 			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7388 			last_stmt = vstmt;
7389 		    }
7390 	      }
7391 	  }
7392       /* This can happen when all children are pre-existing vectors or
7393 	 constants.  */
7394       if (!last_stmt)
7395 	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7396       if (!last_stmt)
7397 	{
7398 	  gcc_assert (seen_vector_def);
7399 	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7400 	}
7401       else if (is_ctrl_altering_stmt (last_stmt))
7402 	{
7403 	  /* We split regions to vectorize at control altering stmts
7404 	     with a definition, so this must be an external which
7405 	     we can insert at the start of the region.  */
7406 	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7407 	}
7408       else if (is_a <bb_vec_info> (vinfo)
7409 	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7410 	       && gimple_could_trap_p (stmt_info->stmt))
7411 	{
7412 	  /* We've constrained possibly trapping operations to all come
7413 	     from the same basic-block; even if vectorized defs would allow
7414 	     earlier scheduling, still force vectorized stmts to the original
7415 	     block.  This is only necessary for BB vectorization since for
7416 	     loop vect all operations are in a single BB and scalar stmt
7417 	     based placement doesn't play well with epilogue vectorization.  */
7418 	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
7419 				      gimple_bb (stmt_info->stmt),
7420 				      gimple_bb (last_stmt)));
7421 	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7422 	}
7423       else if (is_a <gphi *> (last_stmt))
7424 	si = gsi_after_labels (gimple_bb (last_stmt));
7425       else
7426 	{
7427 	  si = gsi_for_stmt (last_stmt);
7428 	  gsi_next (&si);
7429 	}
7430     }
7431 
7432   /* Handle purely internal nodes.  */
7433   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7434     {
7435       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
7436 	 be shared with different SLP nodes (but usually it's the same
7437 	 operation apart from the case the stmt is only there for denoting
7438 	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
7439 	 but open-code it here (partly).  */
7440       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7441       gcc_assert (done);
7442       stmt_vec_info slp_stmt_info;
7443       unsigned int i;
7444       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7445 	if (STMT_VINFO_LIVE_P (slp_stmt_info))
7446 	  {
7447 	    done = vectorizable_live_operation (vinfo,
7448 						slp_stmt_info, &si, node,
7449 						instance, i, true, NULL);
7450 	    gcc_assert (done);
7451 	  }
7452     }
7453   else
7454     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7455 }
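
/* A minimal illustration (hypothetical GIMPLE, not taken from the sources)
   of the placement rule implemented above for the general case: if an SLP
   node computes  c = a + b  and the vectorized defs of its children are

     vect_a_1 = ...;
     ...
     vect_b_1 = ...;

   then the node's vector stmts are emitted directly after the latest of
   those defs, i.e.

     vect_b_1 = ...;
     vect_c_1 = vect_a_1 + vect_b_1;

   which is the earliest point at which all operands are available.  */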
7456 
7457 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
7458    For loop vectorization this is done in vectorizable_call, but for SLP
7459    it needs to be deferred until the end of vect_schedule_slp, because
7460    multiple SLP instances may refer to the same scalar stmt.  */
7461 
7462 static void
7463 vect_remove_slp_scalar_calls (vec_info *vinfo,
7464 			      slp_tree node, hash_set<slp_tree> &visited)
7465 {
7466   gimple *new_stmt;
7467   gimple_stmt_iterator gsi;
7468   int i;
7469   slp_tree child;
7470   tree lhs;
7471   stmt_vec_info stmt_info;
7472 
7473   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7474     return;
7475 
7476   if (visited.add (node))
7477     return;
7478 
7479   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7480     vect_remove_slp_scalar_calls (vinfo, child, visited);
7481 
7482   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7483     {
7484       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7485       if (!stmt || gimple_bb (stmt) == NULL)
7486 	continue;
7487       if (is_pattern_stmt_p (stmt_info)
7488 	  || !PURE_SLP_STMT (stmt_info))
7489 	continue;
7490       lhs = gimple_call_lhs (stmt);
7491       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7492       gsi = gsi_for_stmt (stmt);
7493       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7494       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7495     }
7496 }
7497 
7498 static void
7499 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7500 {
7501   hash_set<slp_tree> visited;
7502   vect_remove_slp_scalar_calls (vinfo, node, visited);
7503 }
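
/* A hedged illustration (hypothetical GIMPLE, not taken from the sources)
   of the effect of vect_remove_slp_scalar_calls: for a vectorized SLP
   group of calls

     a_1 = __builtin_sqrtf (x_1);
     a_2 = __builtin_sqrtf (x_2);

   the scalar calls are replaced by assignments of zero to their lhs

     a_1 = 0.0;
     a_2 = 0.0;

   leaving trivially dead stmts that later DCE can remove.  */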
7504 
7505 /* Vectorize the instance root.  */
7506 
7507 void
7508 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7509 {
7510   gassign *rstmt = NULL;
7511 
7512   if (instance->kind == slp_inst_kind_ctor)
7513     {
7514       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7515 	{
7516 	  gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7517 	  tree vect_lhs = gimple_get_lhs (child_stmt);
7518 	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7519 	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7520 					  TREE_TYPE (vect_lhs)))
7521 	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7522 			       vect_lhs);
7523 	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
7524 	}
7525       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7526 	{
7527 	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7528 	  gimple *child_stmt;
7529 	  int j;
7530 	  vec<constructor_elt, va_gc> *v;
7531 	  vec_alloc (v, nelts);
7532 
7533 	  /* A CTOR can handle V16HI composition from VNx8HI so we
7534 	     do not need to convert vector elements if the types
7535 	     do not match.  */
7536 	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7537 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7538 				    gimple_get_lhs (child_stmt));
7539 	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7540 	  tree rtype
7541 	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7542 	  tree r_constructor = build_constructor (rtype, v);
7543 	  rstmt = gimple_build_assign (lhs, r_constructor);
7544 	}
7545     }
7546   else if (instance->kind == slp_inst_kind_bb_reduc)
7547     {
7548       /* Largely inspired by reduction chain epilogue handling in
7549 	 vect_create_epilog_for_reduction.  */
7550       vec<tree> vec_defs = vNULL;
7551       vect_get_slp_defs (node, &vec_defs);
7552       enum tree_code reduc_code
7553 	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7554       /* ???  We actually have to reflect signs somewhere.  */
7555       if (reduc_code == MINUS_EXPR)
7556 	reduc_code = PLUS_EXPR;
7557       gimple_seq epilogue = NULL;
7558       /* We may end up with more than one vector result; reduce them
7559 	 to one vector.  */
7560       tree vec_def = vec_defs[0];
7561       for (unsigned i = 1; i < vec_defs.length (); ++i)
7562 	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7563 				vec_def, vec_defs[i]);
7564       vec_defs.release ();
7565       /* ???  Support other schemes than direct internal fn.  */
7566       internal_fn reduc_fn;
7567       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7568 	  || reduc_fn == IFN_LAST)
7569 	gcc_unreachable ();
7570       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7571 				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7572 
7573       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7574       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7575       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7576       update_stmt (gsi_stmt (rgsi));
7577       return;
7578     }
7579   else
7580     gcc_unreachable ();
7581 
7582   gcc_assert (rstmt);
7583 
7584   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7585   gsi_replace (&rgsi, rstmt, true);
7586 }
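
/* Hedged examples (hypothetical GIMPLE, not taken from the sources) for the
   two instance kinds handled above.  For slp_inst_kind_ctor a root like

     x_1 = {a_1, b_1, c_1, d_1};

   is replaced by a copy from the single vector def (wrapped in a
   VIEW_CONVERT_EXPR if the root and vector types differ), or by a
   CONSTRUCTOR of the vector defs when there is more than one.  For
   slp_inst_kind_bb_reduc the vector defs are first combined with the
   reduction code and then reduced to a scalar via the matching internal
   function which replaces the root stmt's rhs, e.g.

     vect_tem_1 = vect_def_1 + vect_def_2;
     x_1 = .REDUC_PLUS (vect_tem_1);  */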
7587 
7588 struct slp_scc_info
7589 {
7590   bool on_stack;
7591   int dfs;
7592   int lowlink;
7593 };
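
/* These fields implement Tarjan's SCC algorithm: dfs is the pre-order index
   assigned when a node is first visited, lowlink is the smallest index
   reachable from the node's DFS subtree and on_stack tracks membership in
   the current candidate SCC.  A node is the root of an SCC exactly when its
   lowlink equals its dfs index.  */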
7594 
7595 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
7596 
7597 static void
7598 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7599 		   hash_map<slp_tree, slp_scc_info> &scc_info,
7600 		   int &maxdfs, vec<slp_tree> &stack)
7601 {
7602   bool existed_p;
7603   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7604   gcc_assert (!existed_p);
7605   info->dfs = maxdfs;
7606   info->lowlink = maxdfs;
7607   maxdfs++;
7608 
7609   /* Leaf.  */
7610   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7611     {
7612       info->on_stack = false;
7613       vect_schedule_slp_node (vinfo, node, instance);
7614       return;
7615     }
7616 
7617   info->on_stack = true;
7618   stack.safe_push (node);
7619 
7620   unsigned i;
7621   slp_tree child;
7622   /* DFS recurse.  */
7623   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7624     {
7625       if (!child)
7626 	continue;
7627       slp_scc_info *child_info = scc_info.get (child);
7628       if (!child_info)
7629 	{
7630 	  vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7631 	  /* Recursion might have re-allocated the node.  */
7632 	  info = scc_info.get (node);
7633 	  child_info = scc_info.get (child);
7634 	  info->lowlink = MIN (info->lowlink, child_info->lowlink);
7635 	}
7636       else if (child_info->on_stack)
7637 	info->lowlink = MIN (info->lowlink, child_info->dfs);
7638     }
7639   if (info->lowlink != info->dfs)
7640     return;
7641 
7642   auto_vec<slp_tree, 4> phis_to_fixup;
7643 
7644   /* Singleton.  */
7645   if (stack.last () == node)
7646     {
7647       stack.pop ();
7648       info->on_stack = false;
7649       vect_schedule_slp_node (vinfo, node, instance);
7650       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7651 	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7652 	phis_to_fixup.quick_push (node);
7653     }
7654   else
7655     {
7656       /* SCC.  */
7657       int last_idx = stack.length () - 1;
7658       while (stack[last_idx] != node)
7659 	last_idx--;
7660       /* We can break the cycle at PHIs that have at least one child
7661 	 code generated.  Then we could re-start the DFS walk until
7662 	 all nodes in the SCC are covered (we might have new entries
7663 	 for only back-reachable nodes).  But it's simpler to just
7664 	 iterate and schedule those that are ready.  */
7665       unsigned todo = stack.length () - last_idx;
7666       do
7667 	{
7668 	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7669 	    {
7670 	      slp_tree entry = stack[idx];
7671 	      if (!entry)
7672 		continue;
7673 	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7674 			  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7675 	      bool ready = !phi;
7676 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7677 		  if (!child)
7678 		    {
7679 		      gcc_assert (phi);
7680 		      ready = true;
7681 		      break;
7682 		    }
7683 		  else if (scc_info.get (child)->on_stack)
7684 		    {
7685 		      if (!phi)
7686 			{
7687 			  ready = false;
7688 			  break;
7689 			}
7690 		    }
7691 		  else
7692 		    {
7693 		      if (phi)
7694 			{
7695 			  ready = true;
7696 			  break;
7697 			}
7698 		    }
7699 	      if (ready)
7700 		{
7701 		  vect_schedule_slp_node (vinfo, entry, instance);
7702 		  scc_info.get (entry)->on_stack = false;
7703 		  stack[idx] = NULL;
7704 		  todo--;
7705 		  if (phi)
7706 		    phis_to_fixup.safe_push (entry);
7707 		}
7708 	    }
7709 	}
7710       while (todo != 0);
7711 
7712       /* Pop the SCC.  */
7713       stack.truncate (last_idx);
7714     }
7715 
7716   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
7717   slp_tree phi_node;
7718   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7719     {
7720       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7721       edge_iterator ei;
7722       edge e;
7723       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7724 	{
7725 	  unsigned dest_idx = e->dest_idx;
7726 	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7727 	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7728 	    continue;
7729 	  /* Simply fill all args.  */
7730 	  for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7731 	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7732 			 vect_get_slp_vect_def (child, i),
7733 			 e, gimple_phi_arg_location (phi, dest_idx));
7734 	}
7735     }
7736 }
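
/* A hedged sketch (hypothetical GIMPLE, not taken from the sources) of the
   backedge fixup at the end of vect_schedule_scc: a vectorized cycle PHI is
   first created with only the arguments whose defs are already available,

     vect_sum_1 = PHI <vect_init_1 (preheader), (latch)>

   and once the defs of its SCC have been code generated the remaining
   argument is filled in from the corresponding child's vector def,

     vect_sum_1 = PHI <vect_init_1 (preheader), vect_sum_next_1 (latch)>  */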
7737 
7738 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
7739 
7740 void
7741 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7742 {
7743   slp_instance instance;
7744   unsigned int i;
7745 
7746   hash_map<slp_tree, slp_scc_info> scc_info;
7747   int maxdfs = 0;
7748   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7749     {
7750       slp_tree node = SLP_INSTANCE_TREE (instance);
7751       if (dump_enabled_p ())
7752 	{
7753 	  dump_printf_loc (MSG_NOTE, vect_location,
7754 			   "Vectorizing SLP tree:\n");
7755 	  /* ???  Dump all?  */
7756 	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7757 	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7758 			 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7759 	  vect_print_slp_graph (MSG_NOTE, vect_location,
7760 				SLP_INSTANCE_TREE (instance));
7761 	}
7762       /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
7763 	 makes a PHI the node breaking the cycle.  */
7764       auto_vec<slp_tree> stack;
7765       if (!scc_info.get (node))
7766 	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7767 
7768       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7769 	vectorize_slp_instance_root_stmt (node, instance);
7770 
7771       if (dump_enabled_p ())
7772 	dump_printf_loc (MSG_NOTE, vect_location,
7773                          "vectorizing stmts using SLP.\n");
7774     }
7775 
7776   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7777     {
7778       slp_tree root = SLP_INSTANCE_TREE (instance);
7779       stmt_vec_info store_info;
7780       unsigned int j;
7781 
7782       /* Remove scalar call stmts.  Do not do this for basic-block
7783 	 vectorization as not all uses may be vectorized.
7784 	 ???  Why should this be necessary?  DCE should be able to
7785 	 remove the stmts itself.
7786 	 ???  For BB vectorization we can as well remove scalar
7787 	 stmts starting from the SLP tree root if they have no
7788 	 uses.  */
7789       if (is_a <loop_vec_info> (vinfo))
7790 	vect_remove_slp_scalar_calls (vinfo, root);
7791 
7792       /* Remove the original scalar stmts of vectorized stores.  */
7793       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7794         {
7795 	  if (!STMT_VINFO_DATA_REF (store_info)
7796 	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7797 	    break;
7798 
7799 	  store_info = vect_orig_stmt (store_info);
7800 	  /* Free the attached stmt_vec_info and remove the stmt.  */
7801 	  vinfo->remove_stmt (store_info);
7802 
7803 	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7804 	     to not crash in vect_free_slp_tree later.  */
7805 	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7806 	    SLP_TREE_REPRESENTATIVE (root) = NULL;
7807         }
7808     }
7809 }
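
/* A hedged illustration (hypothetical GIMPLE, not taken from the sources) of
   the final cleanup loop in vect_schedule_slp: once the group of scalar
   stores

     MEM[p_1] = a_1;
     MEM[p_1 + 4] = a_2;

   has been vectorized as a single vector store

     MEM <vector(2) int> [p_1] = vect_a_1;

   the scalar stores and their stmt_vec_infos are removed, keeping only the
   vector store.  */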
7810