/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

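/* Allocation pool for SLP nodes, together with a linked list of all
   allocated nodes so that any still live can be released in
   vect_slp_fini.  */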
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

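/* Set up the allocation pool for SLP nodes.  */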
void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

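/* Release any SLP nodes still allocated and destroy the allocation pool.  */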
void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

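/* Allocate an SLP node from the pool.  */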
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

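/* Return an SLP node to the pool.  */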
void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}


/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  if (this->failed)
    free (failed);
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP-only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
                          vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode)))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
        return cond_expr_maps[swap];
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
        switch (gimple_call_internal_fn (call))
          {
          case IFN_MASK_LOAD:
            return arg2_map;

          case IFN_GATHER_LOAD:
            return arg1_map;

          case IFN_MASK_GATHER_LOAD:
            return arg1_arg4_map;

          default:
            break;
          }
    }
  return nullptr;
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             bool *skip_args,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map = vect_get_operand_map (stmt_info->stmt, swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
        commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      int opno = map ? map[i] : int (i);
      if (opno < 0)
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
        {
          oprnd = gimple_arg (stmt_info->stmt, opno);
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
            backedge = dominated_by_p (CDI_DOMINATORS,
                                       gimple_phi_arg_edge (stmt, opno)->src,
                                       gimple_bb (stmt_info->stmt));
        }
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (skip_args[i])
        {
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;
          continue;
        }

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
          && is_pattern_stmt_p (def_stmt_info))
        {
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
              != def_stmt_info)
            oprnd_info->any_pattern = true;
          else
            /* If we promote this to external use the original stmt def.  */
            oprnd_info->ops.last ()
              = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
        }

      /* If there's an extern def on a backedge make sure we can
         code-generate at the region start.
         ??? This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
         goals there.  */
      if (backedge
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS,
                              as_a <bb_vec_info> (vinfo)->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);
          return -1;
        }

      if (first)
        {
          tree type = TREE_TYPE (oprnd);
          dt = dts[i];
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_induction_def:
            case vect_nested_cycle:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
        {
          ++i;
          continue;
        }

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern)
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand matching.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
        if (mapi < nkept && map[mapi] == int (i))
          mapi += 1;
        else if (!operand_equal_p (gimple_call_arg (call1, i),
                                   gimple_call_arg (call2, i)))
          return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify whether the scalar stmts STMTS are isomorphic, do not require data
   permutation and are not of unsupported types of operation.  Return
   true if so, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
         or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* ??? For BB vectorization we want to commutate operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             work for this though but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size))
        {
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }
      /* Record nunits required but continue analysis, producing matches[]
         as if nunits was not an issue.  This allows splitting of groups
         to happen.  */
      if (nunits_vectype
          && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                      nunits_vectype, max_nunits))
        {
          gcc_assert (is_a <bb_vec_info> (vinfo));
          maybe_soft_fail = true;
          soft_fail_nunits_vectype = nunits_vectype;
        }

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
        {
          combined_fn cfn = gimple_call_combined_fn (call_stmt);
          if (cfn != CFN_LAST)
            rhs_code = cfn;
          else
            rhs_code = CALL_EXPR;

          if (cfn == CFN_MASK_LOAD
              || cfn == CFN_GATHER_LOAD
              || cfn == CFN_MASK_GATHER_LOAD)
            load_p = true;
          else if ((internal_fn_p (cfn)
                    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else if (gimple_code (stmt) == GIMPLE_PHI)
        {
          rhs_code = ERROR_MARK;
          phi_p = true;
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = gimple_vuse (stmt);
        }

      /* Check the operation.  */
      if (i == 0)
        {
          *node_vectype = vectype;
          first_stmt_code = rhs_code;
          first_stmt_load_p = load_p;
          first_stmt_phi_p = phi_p;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              /* First see if we have a vector/vector shift.  */
              if (!directly_supported_p (rhs_code, vectype, optab_vector))
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  need_same_oprnds = true;
                  first_op1 = gimple_assign_rhs2 (stmt);
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (!load_p
                   && rhs_code == BIT_FIELD_REF)
            {
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (!is_a <bb_vec_info> (vinfo)
                  || TREE_CODE (vec) != SSA_NAME
                  || !operand_equal_p (TYPE_SIZE (vectype),
                                       TYPE_SIZE (TREE_TYPE (vec))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }
            }
          else if (rhs_code == CFN_DIV_POW2)
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)
                    && (rhs_code == ARRAY_REF
                        || rhs_code == BIT_FIELD_REF
                        || rhs_code == INDIRECT_REF
                        || rhs_code == COMPONENT_REF
                        || rhs_code == MEM_REF)))
              || first_stmt_load_p != load_p
              || first_stmt_phi_p != phi_p)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (!load_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       call_stmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
              && (gimple_bb (first_stmt_info->stmt)
                  != gimple_bb (stmt_info->stmt)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BB for PHI "
                                 "or possibly trapping operation in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = gimple_arg (stmt, 1);
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "
                                 "in %G", stmt);
              /* Mismatch.  */
              continue;
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p
              && rhs_code != CFN_GATHER_LOAD
              && rhs_code != CFN_MASK_GATHER_LOAD)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Not grouped loads are not supported.  */
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (!phi_p
              && rhs_code.is_tree_code ()
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR
              && rhs_code != BIT_FIELD_REF)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
          || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
            (soft_fail_nunits_vectype).is_constant (&const_nunits)
          || const_nunits > group_size)
        matches[0] = false;
      else
        {
          /* With constant vector elements simulate a mismatch at the
             point we need to split.  */
          unsigned tail = group_size & (const_nunits - 1);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
        }
      return false;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
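
/* Hash a vector of scalar stmts by combining the UIDs of their gimple
   statements.  */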
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
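
/* Return true if the two stmt vectors are element-wise identical.  */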
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}

/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
                          vec<std::pair<tree_code, gimple *> > &worklist,
                          vec<chain_op_t> &chain,
                          enum tree_code code, gimple *start,
                          gimple *&code_stmt, gimple *&alt_code_stmt,
                          vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
          && gimple_assign_rhs_code (stmt) == code)
        code_stmt = stmt;
      else if (!alt_code_stmt
               && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
        alt_code_stmt = stmt;
      if (chain_stmts)
        chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
        {
          tree op = gimple_op (stmt, opnum);
          vect_def_type dt;
          stmt_vec_info def_stmt_info;
          bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
          gcc_assert (res);
          if (dt == vect_internal_def
              && is_pattern_stmt_p (def_stmt_info))
            op = gimple_get_lhs (def_stmt_info->stmt);
          gimple *use_stmt;
          use_operand_p use_p;
          if (dt == vect_internal_def
              && single_imm_use (op, &use_p, &use_stmt)
              && is_gimple_assign (def_stmt_info->stmt)
              && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
                  || (code == PLUS_EXPR
                      && (gimple_assign_rhs_code (def_stmt_info->stmt)
                          == MINUS_EXPR))))
            {
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              worklist.safe_push (std::make_pair (op_def_code,
                                                  def_stmt_info->stmt));
            }
          else
            {
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              chain.safe_push (chain_op_t (op_def_code, dt, op));
            }
        }
    }
}

typedef hash_map <vec <stmt_vec_info>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *limit, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

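/* Build (or re-use) an SLP node for the stmt group STMTS.  This wraps
   vect_build_slp_tree_2 and caches discovered nodes, as well as discovery
   failures, in BST_MAP keyed on the scalar stmts so the same stmt set is
   not analyzed twice.  */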
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *limit, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         !(*leader)->failed ? "" : "failed ", *leader);
      if (!(*leader)->failed)
        {
          SLP_TREE_REF_COUNT (*leader)++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
          stmts.release ();
          return *leader;
        }
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "starting SLP discovery for node %p\n", res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
                                         &this_max_nunits,
                                         matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p failed\n", res);
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
        {
          unsigned i;
          for (i = 0; i < group_size; ++i)
            if (!matches[i])
              break;
          gcc_assert (i < group_size);
        }
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p succeeded\n", res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}

/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
                                   slp_tree op0, slp_tree op1,
                                   stmt_vec_info oper1, stmt_vec_info oper2,
                                   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}

1627 /* Recursively build an SLP tree starting from NODE.
1628 Fail (and return a value not equal to zero) if def-stmts are not
1629 isomorphic, require data permutation or are of unsupported types of
1630 operation. Otherwise, return 0.
1631 The value returned is the depth in the SLP tree where a mismatch
1632 was found. */
1633
1634 static slp_tree
vect_build_slp_tree_2(vec_info * vinfo,slp_tree node,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,unsigned * limit,unsigned * tree_size,scalar_stmts_to_slp_tree_map_t * bst_map)1635 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1636 vec<stmt_vec_info> stmts, unsigned int group_size,
1637 poly_uint64 *max_nunits,
1638 bool *matches, unsigned *limit, unsigned *tree_size,
1639 scalar_stmts_to_slp_tree_map_t *bst_map)
1640 {
1641 unsigned nops, i, this_tree_size = 0;
1642 poly_uint64 this_max_nunits = *max_nunits;
1643
1644 matches[0] = false;
1645
1646 stmt_vec_info stmt_info = stmts[0];
1647 if (!is_a<gcall *> (stmt_info->stmt)
1648 && !is_a<gassign *> (stmt_info->stmt)
1649 && !is_a<gphi *> (stmt_info->stmt))
1650 return NULL;
1651
1652 nops = gimple_num_args (stmt_info->stmt);
1653 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1654 nops = map[0];
1655
1656 /* If the SLP node is a PHI (induction or reduction), terminate
1657 the recursion. */
1658 bool *skip_args = XALLOCAVEC (bool, nops);
1659 memset (skip_args, 0, sizeof (bool) * nops);
1660 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1661 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1662 {
1663 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1664 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1665 group_size);
1666 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1667 max_nunits))
1668 return NULL;
1669
1670 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1671 if (def_type == vect_induction_def)
1672 {
1673 /* Induction PHIs are not cycles but walk the initial
1674 value. Only for inner loops through, for outer loops
1675 we need to pick up the value from the actual PHIs
1676 to more easily support peeling and epilogue vectorization. */
1677 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 if (!nested_in_vect_loop_p (loop, stmt_info))
1679 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1680 else
1681 loop = loop->inner;
1682 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1683 }
1684 else if (def_type == vect_reduction_def
1685 || def_type == vect_double_reduction_def
1686 || def_type == vect_nested_cycle)
1687 {
1688 /* Else def types have to match. */
1689 stmt_vec_info other_info;
1690 bool all_same = true;
1691 FOR_EACH_VEC_ELT (stmts, i, other_info)
1692 {
1693 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1694 return NULL;
1695 if (other_info != stmt_info)
1696 all_same = false;
1697 }
1698 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699 /* Reduction initial values are not explicitely represented. */
1700 if (!nested_in_vect_loop_p (loop, stmt_info))
1701 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1702 /* Reduction chain backedge defs are filled manually.
1703 ??? Need a better way to identify a SLP reduction chain PHI.
1704 Or a better overall way to SLP match those. */
1705 if (all_same && def_type == vect_reduction_def)
1706 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1707 }
1708 else if (def_type != vect_internal_def)
1709 return NULL;
1710 }
1711
1712
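 /* Check the group of stmts for isomorphism: the same (or a blendable
    pair of) operations and compatible types. This also determines the
    common vector type and whether the group mixes two operators. */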
1713 bool two_operators = false;
1714 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715 tree vectype = NULL_TREE;
1716 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717 &this_max_nunits, matches, &two_operators,
1718 &vectype))
1719 return NULL;
1720
1721 /* If the SLP node is a load, terminate the recursion unless masked. */
1722 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1724 {
1725 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1726 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1727 else
1728 {
1729 *max_nunits = this_max_nunits;
1730 (*tree_size)++;
1731 node = vect_create_new_slp_node (node, stmts, 0);
1732 SLP_TREE_VECTYPE (node) = vectype;
1733 /* And compute the load permutation. Whether it is actually
1734 a permutation depends on the unrolling factor which is
1735 decided later. */
1736 vec<unsigned> load_permutation;
1737 int j;
1738 stmt_vec_info load_info;
1739 load_permutation.create (group_size);
1740 stmt_vec_info first_stmt_info
1741 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1742 bool any_permute = false;
1743 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1744 {
1745 int load_place = vect_get_place_in_interleaving_chain
1746 (load_info, first_stmt_info);
1747 gcc_assert (load_place != -1);
1748 any_permute |= load_place != j;
1749 load_permutation.quick_push (load_place);
1750 }
1751
1752 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1753 {
1754 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1755 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1756 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1757 load_permutation.release ();
1758 /* We cannot handle permuted masked loads, see PR114375. */
1759 if (any_permute
1760 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1761 && DR_GROUP_SIZE (first_stmt_info) != group_size)
1762 || STMT_VINFO_STRIDED_P (stmt_info))
1763 {
1764 matches[0] = false;
1765 return NULL;
1766 }
1767 }
1768 else
1769 {
1770 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1771 return node;
1772 }
1773 }
1774 }
1775 else if (gimple_assign_single_p (stmt_info->stmt)
1776 && !gimple_vuse (stmt_info->stmt)
1777 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1778 {
1779 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1780 the same SSA name vector of a type compatible with VECTYPE. */
1781 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1782 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1783 stmt_vec_info estmt_info;
1784 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1785 {
1786 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1787 tree bfref = gimple_assign_rhs1 (estmt);
1788 HOST_WIDE_INT lane;
1789 if (!known_eq (bit_field_size (bfref),
1790 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1791 || !constant_multiple_p (bit_field_offset (bfref),
1792 bit_field_size (bfref), &lane))
1793 {
1794 lperm.release ();
1795 matches[0] = false;
1796 return NULL;
1797 }
1798 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1799 }
1800 slp_tree vnode = vect_create_new_slp_node (vNULL);
1801 /* ??? We record vectype here but we hide possibly necessary
1802 punning and instead rely on code generation to materialize
1803 VIEW_CONVERT_EXPRs as necessary. We should instead make
1804 this explicit somehow. */
1805 SLP_TREE_VECTYPE (vnode) = vectype;
1806 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1807 /* We are always building a permutation node even if it is an identity
1808 permute to shield the rest of the vectorizer from the odd node
1809 representing an actual vector without any scalar ops.
1810 ??? We could hide it completely with making the permute node
1811 external? */
1812 node = vect_create_new_slp_node (node, stmts, 1);
1813 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1814 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1815 SLP_TREE_VECTYPE (node) = vectype;
1816 SLP_TREE_CHILDREN (node).quick_push (vnode);
1817 return node;
1818 }
1819 /* When discovery reaches an associatable operation see whether we can
1820 improve that to match up lanes in a way superior to the operand
1821 swapping code which at most looks at two defs.
1822 ??? For BB vectorization we cannot do the brute-force search
1823 for matching as we can succeed by means of builds from scalars
1824 and have no good way to "cost" one build against another. */
1825 else if (is_a <loop_vec_info> (vinfo)
1826 /* ??? We don't handle !vect_internal_def defs below. */
1827 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1828 && is_gimple_assign (stmt_info->stmt)
1829 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1830 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1831 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1832 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1833 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1834 {
1835 /* See if we have a chain of (mixed) adds or subtracts or other
1836 associatable ops. */
1837 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1838 if (code == MINUS_EXPR)
1839 code = PLUS_EXPR;
1840 stmt_vec_info other_op_stmt_info = NULL;
1841 stmt_vec_info op_stmt_info = NULL;
1842 unsigned chain_len = 0;
1843 auto_vec<chain_op_t> chain;
1844 auto_vec<std::pair<tree_code, gimple *> > worklist;
1845 auto_vec<vec<chain_op_t> > chains (group_size);
1846 auto_vec<slp_tree, 4> children;
1847 bool hard_fail = true;
1848 for (unsigned lane = 0; lane < group_size; ++lane)
1849 {
1850 /* For each lane linearize the addition/subtraction (or other
1851 uniform associatable operation) expression tree. */
1852 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1853 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1854 stmts[lane]->stmt, op_stmt, other_op_stmt,
1855 NULL);
1856 if (!op_stmt_info && op_stmt)
1857 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1858 if (!other_op_stmt_info && other_op_stmt)
1859 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1860 if (chain.length () == 2)
1861 {
1862 /* In a chain of just two elements resort to the regular
1863 operand swapping scheme. If we run into a length
1864 mismatch still hard-FAIL. */
1865 if (chain_len == 0)
1866 hard_fail = false;
1867 else
1868 {
1869 matches[lane] = false;
1870 /* ??? We might want to process the other lanes, but
1871 make sure to not give false matching hints to the
1872 caller for lanes we did not process. */
1873 if (lane != group_size - 1)
1874 matches[0] = false;
1875 }
1876 break;
1877 }
1878 else if (chain_len == 0)
1879 chain_len = chain.length ();
1880 else if (chain.length () != chain_len)
1881 {
1882 /* ??? Here we could slip in magic to compensate with
1883 neutral operands. */
1884 matches[lane] = false;
1885 if (lane != group_size - 1)
1886 matches[0] = false;
1887 break;
1888 }
1889 chains.quick_push (chain.copy ());
1890 chain.truncate (0);
1891 }
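 /* Only try re-association if linearization succeeded for all lanes,
    i.e. every lane pushed a chain of the same length above. */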
1892 if (chains.length () == group_size)
1893 {
1894 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1895 if (!op_stmt_info)
1896 {
1897 hard_fail = false;
1898 goto out;
1899 }
1900 /* Now we have a set of chains with the same length. */
1901 /* 1. pre-sort according to def_type and operation. */
1902 for (unsigned lane = 0; lane < group_size; ++lane)
1903 chains[lane].stablesort (dt_sort_cmp, vinfo);
1904 if (dump_enabled_p ())
1905 {
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "pre-sorted chains of %s\n",
1908 get_tree_code_name (code));
1909 for (unsigned lane = 0; lane < group_size; ++lane)
1910 {
1911 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1912 dump_printf (MSG_NOTE, "%s %T ",
1913 get_tree_code_name (chains[lane][opnum].code),
1914 chains[lane][opnum].op);
1915 dump_printf (MSG_NOTE, "\n");
1916 }
1917 }
1918 /* 2. try to build children nodes, associating as necessary. */
1919 for (unsigned n = 0; n < chain_len; ++n)
1920 {
1921 vect_def_type dt = chains[0][n].dt;
1922 unsigned lane;
1923 for (lane = 0; lane < group_size; ++lane)
1924 if (chains[lane][n].dt != dt)
1925 {
1926 if (dt == vect_constant_def
1927 && chains[lane][n].dt == vect_external_def)
1928 dt = vect_external_def;
1929 else if (dt == vect_external_def
1930 && chains[lane][n].dt == vect_constant_def)
1931 ;
1932 else
1933 break;
1934 }
1935 if (lane != group_size)
1936 {
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "giving up on chain due to mismatched "
1940 "def types\n");
1941 matches[lane] = false;
1942 if (lane != group_size - 1)
1943 matches[0] = false;
1944 goto out;
1945 }
1946 if (dt == vect_constant_def
1947 || dt == vect_external_def)
1948 {
1949 /* Check whether we can build the invariant. If we can't
1950 we never will be able to. */
1951 tree type = TREE_TYPE (chains[0][n].op);
1952 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1953 && (TREE_CODE (type) == BOOLEAN_TYPE
1954 || !can_duplicate_and_interleave_p (vinfo, group_size,
1955 type)))
1956 {
1957 matches[0] = false;
1958 goto out;
1959 }
1960 vec<tree> ops;
1961 ops.create (group_size);
1962 for (lane = 0; lane < group_size; ++lane)
1963 ops.quick_push (chains[lane][n].op);
1964 slp_tree child = vect_create_new_slp_node (ops);
1965 SLP_TREE_DEF_TYPE (child) = dt;
1966 children.safe_push (child);
1967 }
1968 else if (dt != vect_internal_def)
1969 {
1970 /* Not sure, we might need something special.
1971 gcc.dg/vect/pr96854.c,
1972 gfortran.dg/vect/fast-math-pr37021.f90
1973 and gfortran.dg/vect/pr61171.f trigger. */
1974 /* Soft-fail for now. */
1975 hard_fail = false;
1976 goto out;
1977 }
1978 else
1979 {
1980 vec<stmt_vec_info> op_stmts;
1981 op_stmts.create (group_size);
1982 slp_tree child = NULL;
1983 /* Brute-force our way. We have to consider a lane
1984 failing after fixing an earlier fail up in the
1985 SLP discovery recursion. So track the current
1986 permute per lane. */
1987 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1988 memset (perms, 0, sizeof (unsigned) * group_size);
1989 do
1990 {
1991 op_stmts.truncate (0);
1992 for (lane = 0; lane < group_size; ++lane)
1993 op_stmts.quick_push
1994 (vinfo->lookup_def (chains[lane][n].op));
1995 child = vect_build_slp_tree (vinfo, op_stmts,
1996 group_size, &this_max_nunits,
1997 matches, limit,
1998 &this_tree_size, bst_map);
1999 /* ??? We're likely getting too many fatal mismatches
2000 here so maybe we want to ignore them (but then we
2001 have no idea which lanes fatally mismatched). */
2002 if (child || !matches[0])
2003 break;
2004 /* Swap another lane we have not yet matched up into
2005 lanes that did not match. If we run out of
2006 permute possibilities for a lane terminate the
2007 search. */
2008 bool term = false;
2009 for (lane = 1; lane < group_size; ++lane)
2010 if (!matches[lane])
2011 {
2012 if (n + perms[lane] + 1 == chain_len)
2013 {
2014 term = true;
2015 break;
2016 }
2017 std::swap (chains[lane][n],
2018 chains[lane][n + perms[lane] + 1]);
2019 perms[lane]++;
2020 }
2021 if (term)
2022 break;
2023 }
2024 while (1);
2025 if (!child)
2026 {
2027 if (dump_enabled_p ())
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "failed to match up op %d\n", n);
2030 op_stmts.release ();
2031 if (lane != group_size - 1)
2032 matches[0] = false;
2033 else
2034 matches[lane] = false;
2035 goto out;
2036 }
2037 if (dump_enabled_p ())
2038 {
2039 dump_printf_loc (MSG_NOTE, vect_location,
2040 "matched up op %d to\n", n);
2041 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2042 }
2043 children.safe_push (child);
2044 }
2045 }
2046 /* 3. build SLP nodes to combine the chain. */
2047 for (unsigned lane = 0; lane < group_size; ++lane)
2048 if (chains[lane][0].code != code)
2049 {
2050 /* See if there's any alternate all-PLUS entry. */
2051 unsigned n;
2052 for (n = 1; n < chain_len; ++n)
2053 {
2054 for (lane = 0; lane < group_size; ++lane)
2055 if (chains[lane][n].code != code)
2056 break;
2057 if (lane == group_size)
2058 break;
2059 }
2060 if (n != chain_len)
2061 {
2062 /* Swap that in at first position. */
2063 std::swap (children[0], children[n]);
2064 for (lane = 0; lane < group_size; ++lane)
2065 std::swap (chains[lane][0], chains[lane][n]);
2066 }
2067 else
2068 {
2069 /* ??? When this triggers and we end up with two
2070 vect_constant/external_def up-front things break (ICE)
2071 spectacularly finding an insertion place for the
2072 all-constant op. We should have a fully
2073 vect_internal_def operand though(?) so we can swap
2074 that into first place and then prepend the all-zero
2075 constant. */
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "inserting constant zero to compensate "
2079 "for (partially) negated first "
2080 "operand\n");
2081 chain_len++;
2082 for (lane = 0; lane < group_size; ++lane)
2083 chains[lane].safe_insert
2084 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2085 vec<tree> zero_ops;
2086 zero_ops.create (group_size);
2087 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2088 for (lane = 1; lane < group_size; ++lane)
2089 zero_ops.quick_push (zero_ops[0]);
2090 slp_tree zero = vect_create_new_slp_node (zero_ops);
2091 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2092 children.safe_insert (0, zero);
2093 }
2094 break;
2095 }
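 /* Combine the chain elements left to right, creating one new SLP node
    per partial result. The last combination reuses NODE and the
    original scalar stmts as the final lane configuration. */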
2096 for (unsigned i = 1; i < children.length (); ++i)
2097 {
2098 slp_tree op0 = children[i - 1];
2099 slp_tree op1 = children[i];
2100 bool this_two_op = false;
2101 for (unsigned lane = 0; lane < group_size; ++lane)
2102 if (chains[lane][i].code != chains[0][i].code)
2103 {
2104 this_two_op = true;
2105 break;
2106 }
2107 slp_tree child;
2108 if (i == children.length () - 1)
2109 child = vect_create_new_slp_node (node, stmts, 2);
2110 else
2111 child = vect_create_new_slp_node (2, ERROR_MARK);
2112 if (this_two_op)
2113 {
2114 vec<std::pair<unsigned, unsigned> > lperm;
2115 lperm.create (group_size);
2116 for (unsigned lane = 0; lane < group_size; ++lane)
2117 lperm.quick_push (std::make_pair
2118 (chains[lane][i].code != chains[0][i].code, lane));
2119 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2120 (chains[0][i].code == code
2121 ? op_stmt_info
2122 : other_op_stmt_info),
2123 (chains[0][i].code == code
2124 ? other_op_stmt_info
2125 : op_stmt_info),
2126 lperm);
2127 }
2128 else
2129 {
2130 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2131 SLP_TREE_VECTYPE (child) = vectype;
2132 SLP_TREE_LANES (child) = group_size;
2133 SLP_TREE_CHILDREN (child).quick_push (op0);
2134 SLP_TREE_CHILDREN (child).quick_push (op1);
2135 SLP_TREE_REPRESENTATIVE (child)
2136 = (chains[0][i].code == code
2137 ? op_stmt_info : other_op_stmt_info);
2138 }
2139 children[i] = child;
2140 }
2141 *tree_size += this_tree_size + 1;
2142 *max_nunits = this_max_nunits;
2143 while (!chains.is_empty ())
2144 chains.pop ().release ();
2145 return node;
2146 }
2147 out:
2148 while (!children.is_empty ())
2149 vect_free_slp_tree (children.pop ());
2150 while (!chains.is_empty ())
2151 chains.pop ().release ();
2152 /* Hard-fail, otherwise we might run into quadratic processing of the
2153 chains starting one stmt into the chain again. */
2154 if (hard_fail)
2155 return NULL;
2156 /* Fall thru to normal processing. */
2157 }
2158
2159 /* Get at the operands, verifying they are compatible. */
2160 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2161 slp_oprnd_info oprnd_info;
2162 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2163 {
2164 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2165 stmts, i, &oprnds_info);
2166 if (res != 0)
2167 matches[(res == -1) ? 0 : i] = false;
2168 if (!matches[0])
2169 break;
2170 }
2171 for (i = 0; i < group_size; ++i)
2172 if (!matches[i])
2173 {
2174 vect_free_oprnd_info (oprnds_info);
2175 return NULL;
2176 }
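 /* The operand swaps recorded in SWAP have been consumed by
    vect_get_and_check_slp_defs above; poison the pointer so it is
    not used again. */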
2177 swap = NULL;
2178
2179 auto_vec<slp_tree, 4> children;
2180
2181 stmt_info = stmts[0];
2182
2183 /* Create SLP_TREE nodes for the definition node/s. */
2184 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2185 {
2186 slp_tree child;
2187 unsigned int j;
2188
2189 /* We're skipping certain operands from processing, for example
2190 outer loop reduction initial defs. */
2191 if (skip_args[i])
2192 {
2193 children.safe_push (NULL);
2194 continue;
2195 }
2196
2197 if (oprnd_info->first_dt == vect_uninitialized_def)
2198 {
2199 /* COND_EXPRs possibly have one operand too many if the
2200 condition is an SSA name. */
2201 gcc_assert (i == 3 && nops == 4);
2202 continue;
2203 }
2204
2205 if (is_a <bb_vec_info> (vinfo)
2206 && oprnd_info->first_dt == vect_internal_def
2207 && !oprnd_info->any_pattern)
2208 {
2209 /* For BB vectorization, if all defs are the same do not
2210 bother to continue the build along the single-lane
2211 graph but use a splat of the scalar value. */
2212 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2213 for (j = 1; j < group_size; ++j)
2214 if (oprnd_info->def_stmts[j] != first_def)
2215 break;
2216 if (j == group_size
2217 /* But avoid doing this for loads where we may be
2218 able to CSE things, unless the stmt is not
2219 vectorizable. */
2220 && (!STMT_VINFO_VECTORIZABLE (first_def)
2221 || !gimple_vuse (first_def->stmt)))
2222 {
2223 if (dump_enabled_p ())
2224 dump_printf_loc (MSG_NOTE, vect_location,
2225 "Using a splat of the uniform operand %G",
2226 first_def->stmt);
2227 oprnd_info->first_dt = vect_external_def;
2228 }
2229 }
2230
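 /* Constant and external operands become invariant leaf nodes built
    from the collected scalar ops. */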
2231 if (oprnd_info->first_dt == vect_external_def
2232 || oprnd_info->first_dt == vect_constant_def)
2233 {
2234 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2235 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2236 oprnd_info->ops = vNULL;
2237 children.safe_push (invnode);
2238 continue;
2239 }
2240
2241 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2242 group_size, &this_max_nunits,
2243 matches, limit,
2244 &this_tree_size, bst_map)) != NULL)
2245 {
2246 oprnd_info->def_stmts = vNULL;
2247 children.safe_push (child);
2248 continue;
2249 }
2250
2251 /* If the SLP build for operand zero failed and operand zero
2252 and one can be commuted try that for the scalar stmts
2253 that failed the match. */
2254 if (i == 0
2255 /* A first scalar stmt mismatch signals a fatal mismatch. */
2256 && matches[0]
2257 /* ??? For COND_EXPRs we can swap the comparison operands
2258 as well as the arms under some constraints. */
2259 && nops == 2
2260 && oprnds_info[1]->first_dt == vect_internal_def
2261 && is_gimple_assign (stmt_info->stmt)
2262 /* Swapping operands for reductions breaks assumptions later on. */
2263 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2264 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2265 {
2266 /* See whether we can swap the matching or the non-matching
2267 stmt operands. */
2268 bool swap_not_matching = true;
2269 do
2270 {
2271 for (j = 0; j < group_size; ++j)
2272 {
2273 if (matches[j] != !swap_not_matching)
2274 continue;
2275 stmt_vec_info stmt_info = stmts[j];
2276 /* Verify if we can swap operands of this stmt. */
2277 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2278 if (!stmt
2279 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2280 {
2281 if (!swap_not_matching)
2282 goto fail;
2283 swap_not_matching = false;
2284 break;
2285 }
2286 }
2287 }
2288 while (j != group_size);
2289
2290 /* Swap mismatched definition stmts. */
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE, vect_location,
2293 "Re-trying with swapped operands of stmts ");
2294 for (j = 0; j < group_size; ++j)
2295 if (matches[j] == !swap_not_matching)
2296 {
2297 std::swap (oprnds_info[0]->def_stmts[j],
2298 oprnds_info[1]->def_stmts[j]);
2299 std::swap (oprnds_info[0]->ops[j],
2300 oprnds_info[1]->ops[j]);
2301 if (dump_enabled_p ())
2302 dump_printf (MSG_NOTE, "%d ", j);
2303 }
2304 if (dump_enabled_p ())
2305 dump_printf (MSG_NOTE, "\n");
2306 /* After swapping some operands we lost track whether an
2307 operand has any pattern defs so be conservative here. */
2308 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2309 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2310 /* And try again with scratch 'matches' ... */
2311 bool *tem = XALLOCAVEC (bool, group_size);
2312 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2313 group_size, &this_max_nunits,
2314 tem, limit,
2315 &this_tree_size, bst_map)) != NULL)
2316 {
2317 oprnd_info->def_stmts = vNULL;
2318 children.safe_push (child);
2319 continue;
2320 }
2321 }
2322 fail:
2323
2324 /* If the SLP build failed and we analyze a basic-block
2325 simply treat nodes we fail to build as externally defined
2326 (and thus build vectors from the scalar defs).
2327 The cost model will reject outright expensive cases.
2328 ??? This doesn't treat cases where permutation ultimately
2329 fails (or we don't try permutation below). Ideally we'd
2330 even compute a permutation that will end up with the maximum
2331 SLP tree size... */
2332 if (is_a <bb_vec_info> (vinfo)
2333 /* ??? Rejecting patterns this way doesn't work. We'd have to
2334 do extra work to cancel the pattern so the uses see the
2335 scalar version. */
2336 && !is_pattern_stmt_p (stmt_info)
2337 && !oprnd_info->any_pattern)
2338 {
2339 /* But if there's a leading vector sized set of matching stmts
2340 fail here so we can split the group. This matches the condition
2341 vect_analyze_slp_instance uses. */
2342 /* ??? We might want to split here and combine the results to support
2343 multiple vector sizes better. */
2344 for (j = 0; j < group_size; ++j)
2345 if (!matches[j])
2346 break;
2347 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2348 {
2349 if (dump_enabled_p ())
2350 dump_printf_loc (MSG_NOTE, vect_location,
2351 "Building vector operands from scalars\n");
2352 this_tree_size++;
2353 child = vect_create_new_slp_node (oprnd_info->ops);
2354 children.safe_push (child);
2355 oprnd_info->ops = vNULL;
2356 continue;
2357 }
2358 }
2359
2360 gcc_assert (child == NULL);
2361 FOR_EACH_VEC_ELT (children, j, child)
2362 if (child)
2363 vect_free_slp_tree (child);
2364 vect_free_oprnd_info (oprnds_info);
2365 return NULL;
2366 }
2367
2368 vect_free_oprnd_info (oprnds_info);
2369
2370 /* If all children of this node are built up from uniform scalars,
2371 or if building it requires more than one possibly expensive
2372 vector construction, throw the node away so it is built up
2373 from scalars instead. The exception is the SLP node for the vector store. */
2374 if (is_a <bb_vec_info> (vinfo)
2375 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2376 /* ??? Rejecting patterns this way doesn't work. We'd have to
2377 do extra work to cancel the pattern so the uses see the
2378 scalar version. */
2379 && !is_pattern_stmt_p (stmt_info))
2380 {
2381 slp_tree child;
2382 unsigned j;
2383 bool all_uniform_p = true;
2384 unsigned n_vector_builds = 0;
2385 FOR_EACH_VEC_ELT (children, j, child)
2386 {
2387 if (!child)
2388 ;
2389 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2390 all_uniform_p = false;
2391 else if (!vect_slp_tree_uniform_p (child))
2392 {
2393 all_uniform_p = false;
2394 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2395 n_vector_builds++;
2396 }
2397 }
2398 if (all_uniform_p
2399 || n_vector_builds > 1
2400 || (n_vector_builds == children.length ()
2401 && is_a <gphi *> (stmt_info->stmt)))
2402 {
2403 /* Roll back. */
2404 matches[0] = false;
2405 FOR_EACH_VEC_ELT (children, j, child)
2406 if (child)
2407 vect_free_slp_tree (child);
2408
2409 if (dump_enabled_p ())
2410 dump_printf_loc (MSG_NOTE, vect_location,
2411 "Building parent vector operands from "
2412 "scalars instead\n");
2413 return NULL;
2414 }
2415 }
2416
2417 *tree_size += this_tree_size + 1;
2418 *max_nunits = this_max_nunits;
2419
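 /* For a group mixing two operators build one internal SLP node per
    operation, sharing the children, and blend their lanes with a
    VEC_PERM_EXPR node on top. */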
2420 if (two_operators)
2421 {
2422 /* ??? We'd likely want to either cache in bst_map sth like
2423 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2424 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2425 explicit stmts to put in so the keying on 'stmts' doesn't
2426 work (but we have the same issue with nodes that use 'ops'). */
2427 slp_tree one = new _slp_tree;
2428 slp_tree two = new _slp_tree;
2429 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2430 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2431 SLP_TREE_VECTYPE (one) = vectype;
2432 SLP_TREE_VECTYPE (two) = vectype;
2433 SLP_TREE_CHILDREN (one).safe_splice (children);
2434 SLP_TREE_CHILDREN (two).safe_splice (children);
2435 slp_tree child;
2436 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2437 SLP_TREE_REF_COUNT (child)++;
2438
2439 /* Here we record the original defs since this
2440 node represents the final lane configuration. */
2441 node = vect_create_new_slp_node (node, stmts, 2);
2442 SLP_TREE_VECTYPE (node) = vectype;
2443 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2444 SLP_TREE_CHILDREN (node).quick_push (one);
2445 SLP_TREE_CHILDREN (node).quick_push (two);
2446 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2447 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2448 enum tree_code ocode = ERROR_MARK;
2449 stmt_vec_info ostmt_info;
2450 unsigned j = 0;
2451 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2452 {
2453 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2454 if (gimple_assign_rhs_code (ostmt) != code0)
2455 {
2456 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2457 ocode = gimple_assign_rhs_code (ostmt);
2458 j = i;
2459 }
2460 else
2461 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2462 }
2463 SLP_TREE_CODE (one) = code0;
2464 SLP_TREE_CODE (two) = ocode;
2465 SLP_TREE_LANES (one) = stmts.length ();
2466 SLP_TREE_LANES (two) = stmts.length ();
2467 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2468 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2469 return node;
2470 }
2471
2472 node = vect_create_new_slp_node (node, stmts, nops);
2473 SLP_TREE_VECTYPE (node) = vectype;
2474 SLP_TREE_CHILDREN (node).splice (children);
2475 return node;
2476 }
2477
2478 /* Dump a single SLP tree NODE. */
2479
2480 static void
2481 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2482 slp_tree node)
2483 {
2484 unsigned i, j;
2485 slp_tree child;
2486 stmt_vec_info stmt_info;
2487 tree op;
2488
2489 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2490 dump_user_location_t user_loc = loc.get_user_location ();
2491 dump_printf_loc (metadata, user_loc,
2492 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2493 ", refcnt=%u)",
2494 SLP_TREE_DEF_TYPE (node) == vect_external_def
2495 ? " (external)"
2496 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2497 ? " (constant)"
2498 : ""), node,
2499 estimated_poly_value (node->max_nunits),
2500 SLP_TREE_REF_COUNT (node));
2501 if (SLP_TREE_VECTYPE (node))
2502 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2503 dump_printf (metadata, "\n");
2504 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2505 {
2506 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2507 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2508 else
2509 dump_printf_loc (metadata, user_loc, "op template: %G",
2510 SLP_TREE_REPRESENTATIVE (node)->stmt);
2511 }
2512 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2513 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2514 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2515 else
2516 {
2517 dump_printf_loc (metadata, user_loc, "\t{ ");
2518 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2519 dump_printf (metadata, "%T%s ", op,
2520 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2521 dump_printf (metadata, "}\n");
2522 }
2523 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2524 {
2525 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2526 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2527 dump_printf (dump_kind, " %u", j);
2528 dump_printf (dump_kind, " }\n");
2529 }
2530 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2531 {
2532 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2533 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2534 dump_printf (dump_kind, " %u[%u]",
2535 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2536 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2537 dump_printf (dump_kind, " }\n");
2538 }
2539 if (SLP_TREE_CHILDREN (node).is_empty ())
2540 return;
2541 dump_printf_loc (metadata, user_loc, "\tchildren");
2542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2543 dump_printf (dump_kind, " %p", (void *)child);
2544 dump_printf (dump_kind, "\n");
2545 }
2546
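 /* Convenience debug routine: dump the single SLP tree NODE. */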
2547 DEBUG_FUNCTION void
2548 debug (slp_tree node)
2549 {
2550 debug_dump_context ctx;
2551 vect_print_slp_tree (MSG_NOTE,
2552 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2553 node);
2554 }
2555
2556 /* Recursive helper for the dot producer below. */
2557
2558 static void
2559 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2560 {
2561 if (visited.add (node))
2562 return;
2563
2564 fprintf (f, "\"%p\" [label=\"", (void *)node);
2565 vect_print_slp_tree (MSG_NOTE,
2566 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2567 node);
2568 fprintf (f, "\"];\n");
2569
2570
2571 for (slp_tree child : SLP_TREE_CHILDREN (node))
2572 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2573
2574 for (slp_tree child : SLP_TREE_CHILDREN (node))
2575 if (child)
2576 dot_slp_tree (f, child, visited);
2577 }
2578
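 /* Dump the SLP graph rooted at NODE in GraphViz dot format to the
    file FNAME. */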
2579 DEBUG_FUNCTION void
2580 dot_slp_tree (const char *fname, slp_tree node)
2581 {
2582 FILE *f = fopen (fname, "w");
2583 fprintf (f, "digraph {\n");
2584 fflush (f);
2585 {
2586 debug_dump_context ctx (f);
2587 hash_set<slp_tree> visited;
2588 dot_slp_tree (f, node, visited);
2589 }
2590 fflush (f);
2591 fprintf (f, "}\n");
2592 fclose (f);
2593 }
2594
2595 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2596
2597 static void
2598 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2599 slp_tree node, hash_set<slp_tree> &visited)
2600 {
2601 unsigned i;
2602 slp_tree child;
2603
2604 if (visited.add (node))
2605 return;
2606
2607 vect_print_slp_tree (dump_kind, loc, node);
2608
2609 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2610 if (child)
2611 vect_print_slp_graph (dump_kind, loc, child, visited);
2612 }
2613
2614 static void
2615 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2616 slp_tree entry)
2617 {
2618 hash_set<slp_tree> visited;
2619 vect_print_slp_graph (dump_kind, loc, entry, visited);
2620 }
2621
2622 /* Mark the tree rooted at NODE with PURE_SLP. */
2623
2624 static void
2625 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2626 {
2627 int i;
2628 stmt_vec_info stmt_info;
2629 slp_tree child;
2630
2631 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2632 return;
2633
2634 if (visited.add (node))
2635 return;
2636
2637 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2638 STMT_SLP_TYPE (stmt_info) = pure_slp;
2639
2640 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2641 if (child)
2642 vect_mark_slp_stmts (child, visited);
2643 }
2644
2645 static void
2646 vect_mark_slp_stmts (slp_tree node)
2647 {
2648 hash_set<slp_tree> visited;
2649 vect_mark_slp_stmts (node, visited);
2650 }
2651
2652 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2653
2654 static void
2655 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2656 {
2657 int i;
2658 stmt_vec_info stmt_info;
2659 slp_tree child;
2660
2661 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2662 return;
2663
2664 if (visited.add (node))
2665 return;
2666
2667 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2668 {
2669 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2670 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2671 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2672 }
2673
2674 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2675 if (child)
2676 vect_mark_slp_stmts_relevant (child, visited);
2677 }
2678
2679 static void
2680 vect_mark_slp_stmts_relevant (slp_tree node)
2681 {
2682 hash_set<slp_tree> visited;
2683 vect_mark_slp_stmts_relevant (node, visited);
2684 }
2685
2686
2687 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2688
2689 static void
2690 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2691 hash_set<slp_tree> &visited)
2692 {
2693 if (!node || visited.add (node))
2694 return;
2695
2696 if (SLP_TREE_CHILDREN (node).length () == 0)
2697 {
2698 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2699 return;
2700 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2701 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2702 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2703 loads.safe_push (node);
2704 }
2705 else
2706 {
2707 unsigned i;
2708 slp_tree child;
2709 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2710 vect_gather_slp_loads (loads, child, visited);
2711 }
2712 }
2713
2714
2715 /* Find the last scalar stmt in NODE. */
2716
2717 stmt_vec_info
2718 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2719 {
2720 stmt_vec_info last = NULL;
2721 stmt_vec_info stmt_vinfo;
2722
2723 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2724 {
2725 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2726 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2727 }
2728
2729 return last;
2730 }
2731
2732 /* Find the first stmt in NODE. */
2733
2734 stmt_vec_info
2735 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2736 {
2737 stmt_vec_info first = NULL;
2738 stmt_vec_info stmt_vinfo;
2739
2740 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2741 {
2742 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2743 if (!first
2744 || get_later_stmt (stmt_vinfo, first) == first)
2745 first = stmt_vinfo;
2746 }
2747
2748 return first;
2749 }
2750
2751 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2752 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2753 (also containing the first GROUP1_SIZE stmts, since stores are
2754 consecutive), the second containing the remainder.
2755 Return the first stmt in the second group. */
2756
2757 static stmt_vec_info
2758 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2759 {
2760 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2761 gcc_assert (group1_size > 0);
2762 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2763 gcc_assert (group2_size > 0);
2764 DR_GROUP_SIZE (first_vinfo) = group1_size;
2765
2766 stmt_vec_info stmt_info = first_vinfo;
2767 for (unsigned i = group1_size; i > 1; i--)
2768 {
2769 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2770 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2771 }
2772 /* STMT is now the last element of the first group. */
2773 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2774 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2775
2776 DR_GROUP_SIZE (group2) = group2_size;
2777 for (stmt_info = group2; stmt_info;
2778 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2779 {
2780 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2781 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2782 }
2783
2784 /* For the second group, the DR_GROUP_GAP is that before the original group,
2785 plus skipping over the first vector. */
2786 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2787
2788 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2789 DR_GROUP_GAP (first_vinfo) += group2_size;
2790
2791 if (dump_enabled_p ())
2792 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2793 group1_size, group2_size);
2794
2795 return group2;
2796 }
2797
2798 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2799 statements and a vector of NUNITS elements. */
2800
2801 static poly_uint64
2802 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2803 {
2804 return exact_div (common_multiple (nunits, group_size), group_size);
2805 }
2806
2807 /* Helper that checks to see if a node is a load node. */
2808
2809 static inline bool
2810 vect_is_slp_load_node (slp_tree root)
2811 {
2812 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2813 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2814 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2815 }
2816
2817
2818 /* Helper function of optimize_load_redistribution that performs the operation
2819 recursively. */
2820
2821 static slp_tree
2822 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2823 vec_info *vinfo, unsigned int group_size,
2824 hash_map<slp_tree, slp_tree> *load_map,
2825 slp_tree root)
2826 {
2827 if (slp_tree *leader = load_map->get (root))
2828 return *leader;
2829
2830 slp_tree node;
2831 unsigned i;
2832
2833 /* For now, we don't know anything about externals so do not do anything. */
2834 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2835 return NULL;
2836 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2837 {
2838 /* First convert this node into a load node and add it to the leaves
2839 list and flatten the permute from a lane to a load one. If it's
2840 unneeded it will be elided later. */
2841 vec<stmt_vec_info> stmts;
2842 stmts.create (SLP_TREE_LANES (root));
2843 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2844 for (unsigned j = 0; j < lane_perm.length (); j++)
2845 {
2846 std::pair<unsigned, unsigned> perm = lane_perm[j];
2847 node = SLP_TREE_CHILDREN (root)[perm.first];
2848
2849 if (!vect_is_slp_load_node (node)
2850 || SLP_TREE_CHILDREN (node).exists ())
2851 {
2852 stmts.release ();
2853 goto next;
2854 }
2855
2856 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2857 }
2858
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_NOTE, vect_location,
2861 "converting stmts on permute node %p\n", root);
2862
2863 bool *matches = XALLOCAVEC (bool, group_size);
2864 poly_uint64 max_nunits = 1;
2865 unsigned tree_size = 0, limit = 1;
2866 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2867 matches, &limit, &tree_size, bst_map);
2868 if (!node)
2869 stmts.release ();
2870
2871 load_map->put (root, node);
2872 return node;
2873 }
2874
2875 next:
2876 load_map->put (root, NULL);
2877
2878 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2879 {
2880 slp_tree value
2881 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2882 node);
2883 if (value)
2884 {
2885 SLP_TREE_REF_COUNT (value)++;
2886 SLP_TREE_CHILDREN (root)[i] = value;
2887 /* ??? We know the original leafs of the replaced nodes will
2888 be referenced by bst_map, only the permutes created by
2889 pattern matching are not. */
2890 if (SLP_TREE_REF_COUNT (node) == 1)
2891 load_map->remove (node);
2892 vect_free_slp_tree (node);
2893 }
2894 }
2895
2896 return NULL;
2897 }
2898
2899 /* Temporary workaround for loads not being CSEd during SLP build. This
2900 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2901 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2902 same DR such that the final operation is equal to a permuted load. Such
2903 NODES are then directly converted into LOADS themselves. The nodes are
2904 CSEd using BST_MAP. */
2905
2906 static void
2907 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2908 vec_info *vinfo, unsigned int group_size,
2909 hash_map<slp_tree, slp_tree> *load_map,
2910 slp_tree root)
2911 {
2912 slp_tree node;
2913 unsigned i;
2914
2915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2916 {
2917 slp_tree value
2918 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2919 node);
2920 if (value)
2921 {
2922 SLP_TREE_REF_COUNT (value)++;
2923 SLP_TREE_CHILDREN (root)[i] = value;
2924 /* ??? We know the original leafs of the replaced nodes will
2925 be referenced by bst_map, only the permutes created by
2926 pattern matching are not. */
2927 if (SLP_TREE_REF_COUNT (node) == 1)
2928 load_map->remove (node);
2929 vect_free_slp_tree (node);
2930 }
2931 }
2932 }
2933
2934 /* Helper function of vect_match_slp_patterns.
2935
2936 Attempts to match patterns against the slp tree rooted in REF_NODE using
2937 VINFO. Patterns are matched in post-order traversal.
2938
2939 Return true if any pattern matched, in which case the value in
2940 REF_NODE may have been updated; otherwise return false. */
2941
2942 static bool
2943 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2944 slp_tree_to_load_perm_map_t *perm_cache,
2945 slp_compat_nodes_map_t *compat_cache,
2946 hash_set<slp_tree> *visited)
2947 {
2948 unsigned i;
2949 slp_tree node = *ref_node;
2950 bool found_p = false;
2951 if (!node || visited->add (node))
2952 return false;
2953
2954 slp_tree child;
2955 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2956 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2957 vinfo, perm_cache, compat_cache,
2958 visited);
2959
2960 for (unsigned x = 0; x < num__slp_patterns; x++)
2961 {
2962 vect_pattern *pattern
2963 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2964 if (pattern)
2965 {
2966 pattern->build (vinfo);
2967 delete pattern;
2968 found_p = true;
2969 }
2970 }
2971
2972 return found_p;
2973 }
2974
2975 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2976 vec_info VINFO.
2977
2978 Return true if any pattern matched. Patterns are tried in order
2979 and multiple patterns may match. */
2980
2981 static bool
2982 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2983 hash_set<slp_tree> *visited,
2984 slp_tree_to_load_perm_map_t *perm_cache,
2985 slp_compat_nodes_map_t *compat_cache)
2986 {
2987 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2988 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2989
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "Analyzing SLP tree %p for patterns\n",
2993 SLP_INSTANCE_TREE (instance));
2994
2995 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2996 visited);
2997 }
2998
2999 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3000 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3001 Return true if we could use IFN_STORE_LANES instead and if that appears
3002 to be the better approach. */
3003
3004 static bool
3005 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3006 unsigned int group_size,
3007 unsigned int new_group_size)
3008 {
3009 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3010 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3011 if (!vectype)
3012 return false;
3013 /* Allow the split if one of the two new groups would operate on full
3014 vectors *within* rather than across one scalar loop iteration.
3015 This is purely a heuristic, but it should work well for group
3016 sizes of 3 and 4, where the possible splits are:
3017
3018 3->2+1: OK if the vector has exactly two elements
3019 4->2+2: Likewise
3020 4->3+1: Less clear-cut. */
3021 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3022 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3023 return false;
3024 return vect_store_lanes_supported (vectype, group_size, false);
3025 }
3026
3027 /* Analyze an SLP instance starting from a group of grouped stores. Call
3028 vect_build_slp_tree to build a tree of packed stmts if possible.
3029 Return FALSE if it's impossible to SLP any stmt in the loop. */
3030
3031 static bool
3032 vect_analyze_slp_instance (vec_info *vinfo,
3033 scalar_stmts_to_slp_tree_map_t *bst_map,
3034 stmt_vec_info stmt_info, slp_instance_kind kind,
3035 unsigned max_tree_size, unsigned *limit);
3036
3037 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3038 of KIND. Return true if successful. */
3039
3040 static bool
3041 vect_build_slp_instance (vec_info *vinfo,
3042 slp_instance_kind kind,
3043 vec<stmt_vec_info> &scalar_stmts,
3044 vec<stmt_vec_info> &root_stmt_infos,
3045 unsigned max_tree_size, unsigned *limit,
3046 scalar_stmts_to_slp_tree_map_t *bst_map,
3047 /* ??? We need stmt_info for group splitting. */
3048 stmt_vec_info stmt_info_)
3049 {
3050 if (dump_enabled_p ())
3051 {
3052 dump_printf_loc (MSG_NOTE, vect_location,
3053 "Starting SLP discovery for\n");
3054 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 " %G", scalar_stmts[i]->stmt);
3057 }
3058
3059 /* Build the tree for the SLP instance. */
3060 unsigned int group_size = scalar_stmts.length ();
3061 bool *matches = XALLOCAVEC (bool, group_size);
3062 poly_uint64 max_nunits = 1;
3063 unsigned tree_size = 0;
3064 unsigned i;
3065 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3066 &max_nunits, matches, limit,
3067 &tree_size, bst_map);
3068 if (node != NULL)
3069 {
3070 /* Calculate the unrolling factor based on the smallest type. */
3071 poly_uint64 unrolling_factor
3072 = calculate_unrolling_factor (max_nunits, group_size);
3073
3074 if (maybe_ne (unrolling_factor, 1U)
3075 && is_a <bb_vec_info> (vinfo))
3076 {
3077 unsigned HOST_WIDE_INT const_max_nunits;
3078 if (!max_nunits.is_constant (&const_max_nunits)
3079 || const_max_nunits > group_size)
3080 {
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3083 "Build SLP failed: store group "
3084 "size not a multiple of the vector size "
3085 "in basic block SLP\n");
3086 vect_free_slp_tree (node);
3087 return false;
3088 }
3089 /* Fatal mismatch. */
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_NOTE, vect_location,
3092 "SLP discovery succeeded but node needs "
3093 "splitting\n");
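 /* Mark the lane at the first whole-vector boundary as mismatched so
    the store-group splitting code below splits there. */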
3094 memset (matches, true, group_size);
3095 matches[group_size / const_max_nunits * const_max_nunits] = false;
3096 vect_free_slp_tree (node);
3097 }
3098 else
3099 {
3100 /* Create a new SLP instance. */
3101 slp_instance new_instance = XNEW (class _slp_instance);
3102 SLP_INSTANCE_TREE (new_instance) = node;
3103 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3104 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3105 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3106 SLP_INSTANCE_KIND (new_instance) = kind;
3107 new_instance->reduc_phis = NULL;
3108 new_instance->cost_vec = vNULL;
3109 new_instance->subgraph_entries = vNULL;
3110
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE, vect_location,
3113 "SLP size %u vs. limit %u.\n",
3114 tree_size, max_tree_size);
3115
3116 /* Fixup SLP reduction chains. */
3117 if (kind == slp_inst_kind_reduc_chain)
3118 {
3119 /* If this is a reduction chain with a conversion in front
3120 amend the SLP tree with a node for that. */
3121 gimple *scalar_def
3122 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3123 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3124 {
3125 /* Get at the conversion stmt - we know it's the single use
3126 of the last stmt of the reduction chain. */
3127 use_operand_p use_p;
3128 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3129 &use_p, &scalar_def);
3130 gcc_assert (r);
3131 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3132 next_info = vect_stmt_to_vectorize (next_info);
3133 scalar_stmts = vNULL;
3134 scalar_stmts.create (group_size);
3135 for (unsigned i = 0; i < group_size; ++i)
3136 scalar_stmts.quick_push (next_info);
3137 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3138 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3139 SLP_TREE_CHILDREN (conv).quick_push (node);
3140 SLP_INSTANCE_TREE (new_instance) = conv;
3141 /* We also have to fake this conversion stmt as SLP reduction
3142 group so we don't have to mess with too much code
3143 elsewhere. */
3144 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3145 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3146 }
3147 /* Fill the backedge child of the PHI SLP node. The
3148 general matching code cannot find it because the
3149 scalar code does not reflect how we vectorize the
3150 reduction. */
3151 use_operand_p use_p;
3152 imm_use_iterator imm_iter;
3153 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3154 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3155 gimple_get_lhs (scalar_def))
3156 /* There are exactly two non-debug uses, the reduction
3157 PHI and the loop-closed PHI node. */
3158 if (!is_gimple_debug (USE_STMT (use_p))
3159 && gimple_bb (USE_STMT (use_p)) == loop->header)
3160 {
3161 auto_vec<stmt_vec_info, 64> phis (group_size);
3162 stmt_vec_info phi_info
3163 = vinfo->lookup_stmt (USE_STMT (use_p));
3164 for (unsigned i = 0; i < group_size; ++i)
3165 phis.quick_push (phi_info);
3166 slp_tree *phi_node = bst_map->get (phis);
3167 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3168 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3169 = SLP_INSTANCE_TREE (new_instance);
3170 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3171 }
3172 }
3173
3174 vinfo->slp_instances.safe_push (new_instance);
3175
3176 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3177 the number of scalar stmts in the root in a few places.
3178 Verify that assumption holds. */
3179 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3180 .length () == group_size);
3181
3182 if (dump_enabled_p ())
3183 {
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "Final SLP tree for instance %p:\n", new_instance);
3186 vect_print_slp_graph (MSG_NOTE, vect_location,
3187 SLP_INSTANCE_TREE (new_instance));
3188 }
3189
3190 return true;
3191 }
3192 }
3193 else
3194 {
3195 /* Failed to SLP. */
3196 /* Free the allocated memory. */
3197 scalar_stmts.release ();
3198 }
3199
3200 stmt_vec_info stmt_info = stmt_info_;
3201 /* Try to break the group up into pieces. */
3202 if (kind == slp_inst_kind_store)
3203 {
3204 /* ??? We could delay all the actual splitting of store-groups
3205 until after SLP discovery of the original group completed.
3206 Then we can recurse to vect_build_slp_instance directly. */
3207 for (i = 0; i < group_size; i++)
3208 if (!matches[i])
3209 break;
3210
3211 /* For basic block SLP, try to break the group up into multiples of
3212 a vector size. */
3213 if (is_a <bb_vec_info> (vinfo)
3214 && (i > 1 && i < group_size))
3215 {
3216 tree scalar_type
3217 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3218 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3219 1 << floor_log2 (i));
3220 unsigned HOST_WIDE_INT const_nunits;
3221 if (vectype
3222 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3223 {
3224 /* Split into two groups at the first vector boundary. */
3225 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3226 unsigned group1_size = i & ~(const_nunits - 1);
3227
3228 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_NOTE, vect_location,
3230 "Splitting SLP group at stmt %u\n", i);
3231 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3232 group1_size);
3233 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3234 kind, max_tree_size,
3235 limit);
3236 /* Split the rest at the failure point and possibly
3237 re-analyze the remaining matching part if it has
3238 at least two lanes. */
3239 if (group1_size < i
3240 && (i + 1 < group_size
3241 || i - group1_size > 1))
3242 {
3243 stmt_vec_info rest2 = rest;
3244 rest = vect_split_slp_store_group (rest, i - group1_size);
3245 if (i - group1_size > 1)
3246 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3247 kind, max_tree_size,
3248 limit);
3249 }
3250 /* Re-analyze the non-matching tail if it has at least
3251 two lanes. */
3252 if (i + 1 < group_size)
3253 res |= vect_analyze_slp_instance (vinfo, bst_map,
3254 rest, kind, max_tree_size,
3255 limit);
3256 return res;
3257 }
3258 }
3259
3260 /* For loop vectorization split into arbitrary pieces of size > 1. */
3261 if (is_a <loop_vec_info> (vinfo)
3262 && (i > 1 && i < group_size)
3263 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3264 {
3265 unsigned group1_size = i;
3266
3267 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Splitting SLP group at stmt %u\n", i);
3270
3271 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3272 group1_size);
3273 /* Loop vectorization cannot handle gaps in stores, make sure
3274 the split group appears as strided. */
3275 STMT_VINFO_STRIDED_P (rest) = 1;
3276 DR_GROUP_GAP (rest) = 0;
3277 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3278 DR_GROUP_GAP (stmt_info) = 0;
3279
3280 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3281 kind, max_tree_size, limit);
3282 if (i + 1 < group_size)
3283 res |= vect_analyze_slp_instance (vinfo, bst_map,
3284 rest, kind, max_tree_size, limit);
3285
3286 return res;
3287 }
3288
3289 /* Even though the first vector did not all match, we might be able to SLP
3290 (some) of the remainder. FORNOW ignore this possibility. */
3291 }
3292
3293 /* Failed to SLP. */
3294 if (dump_enabled_p ())
3295 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3296 return false;
3297 }
3298
3299
3300 /* Analyze an SLP instance starting from a group of grouped stores. Call
3301 vect_build_slp_tree to build a tree of packed stmts if possible.
3302 Return FALSE if it's impossible to SLP any stmt in the loop. */
3303
3304 static bool
3305 vect_analyze_slp_instance (vec_info *vinfo,
3306 scalar_stmts_to_slp_tree_map_t *bst_map,
3307 stmt_vec_info stmt_info,
3308 slp_instance_kind kind,
3309 unsigned max_tree_size, unsigned *limit)
3310 {
3311 unsigned int i;
3312 vec<stmt_vec_info> scalar_stmts;
3313
3314 if (is_a <bb_vec_info> (vinfo))
3315 vect_location = stmt_info->stmt;
3316
3317 stmt_vec_info next_info = stmt_info;
3318 if (kind == slp_inst_kind_store)
3319 {
3320 /* Collect the stores and store them in scalar_stmts. */
3321 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3322 while (next_info)
3323 {
3324 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3325 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3326 }
3327 }
3328 else if (kind == slp_inst_kind_reduc_chain)
3329 {
3330 /* Collect the reduction stmts and store them in scalar_stmts. */
3331 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3332 while (next_info)
3333 {
3334 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3335 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3336 }
3337 /* Mark the first element of the reduction chain as reduction to properly
3338 transform the node. In the reduction analysis phase only the last
3339 element of the chain is marked as reduction. */
3340 STMT_VINFO_DEF_TYPE (stmt_info)
3341 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3342 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3343 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3344 }
3345 else if (kind == slp_inst_kind_ctor)
3346 {
3347 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3348 tree val;
3349 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3350 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3351 {
3352 stmt_vec_info def_info = vinfo->lookup_def (val);
3353 def_info = vect_stmt_to_vectorize (def_info);
3354 scalar_stmts.quick_push (def_info);
3355 }
3356 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE, vect_location,
3358 "Analyzing vectorizable constructor: %G\n",
3359 stmt_info->stmt);
3360 }
3361 else if (kind == slp_inst_kind_reduc_group)
3362 {
3363 /* Collect reduction statements. */
3364 const vec<stmt_vec_info> &reductions
3365 = as_a <loop_vec_info> (vinfo)->reductions;
3366 scalar_stmts.create (reductions.length ());
3367 for (i = 0; reductions.iterate (i, &next_info); i++)
3368 if ((STMT_VINFO_RELEVANT_P (next_info)
3369 || STMT_VINFO_LIVE_P (next_info))
3370 /* ??? Make sure we didn't skip a conversion around a reduction
3371 path. In that case we'd have to reverse engineer that conversion
3372 stmt following the chain using reduc_idx and from the PHI
3373 using reduc_def. */
3374 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3375 scalar_stmts.quick_push (next_info);
3376 /* If fewer than two were relevant/live there's nothing to SLP. */
3377 if (scalar_stmts.length () < 2)
3378 return false;
3379 }
3380 else
3381 gcc_unreachable ();
3382
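  /* For a vector constructor the original constructor assignment acts as
     the root stmt of the SLP instance.  */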
3383 vec<stmt_vec_info> roots = vNULL;
3384 if (kind == slp_inst_kind_ctor)
3385 {
3386 roots.create (1);
3387 roots.quick_push (stmt_info);
3388 }
3389 /* Build the tree for the SLP instance. */
3390 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3391 roots,
3392 max_tree_size, limit, bst_map,
3393 kind == slp_inst_kind_store
3394 ? stmt_info : NULL);
3395 if (!res)
3396 roots.release ();
3397
3398 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3399 where we should do store group splitting. */
3400
3401 return res;
3402 }
3403
3404 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3405 trees of packed scalar stmts if SLP is possible. */
3406
3407 opt_result
3408 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3409 {
3410 unsigned int i;
3411 stmt_vec_info first_element;
3412 slp_instance instance;
3413
3414 DUMP_VECT_SCOPE ("vect_analyze_slp");
3415
3416 unsigned limit = max_tree_size;
3417
3418 scalar_stmts_to_slp_tree_map_t *bst_map
3419 = new scalar_stmts_to_slp_tree_map_t ();
3420
3421 /* Find SLP sequences starting from groups of grouped stores. */
3422 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3423 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3424 STMT_VINFO_GROUPED_ACCESS (first_element)
3425 ? slp_inst_kind_store : slp_inst_kind_ctor,
3426 max_tree_size, &limit);
3427
3428 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3429 {
3430 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3431 {
3432 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3433 /* Apply patterns. */
3434 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3435 bb_vinfo->roots[i].stmts[j]
3436 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3437 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3438 bb_vinfo->roots[i].stmts,
3439 bb_vinfo->roots[i].roots,
3440 max_tree_size, &limit, bst_map, NULL))
3441 {
3442 bb_vinfo->roots[i].stmts = vNULL;
3443 bb_vinfo->roots[i].roots = vNULL;
3444 }
3445 }
3446 }
3447
3448 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3449 {
3450 /* Find SLP sequences starting from reduction chains. */
3451 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3452 if (! STMT_VINFO_RELEVANT_P (first_element)
3453 && ! STMT_VINFO_LIVE_P (first_element))
3454 ;
3455 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3456 slp_inst_kind_reduc_chain,
3457 max_tree_size, &limit))
3458 {
3459 /* Dissolve reduction chain group. */
3460 stmt_vec_info vinfo = first_element;
3461 stmt_vec_info last = NULL;
3462 while (vinfo)
3463 {
3464 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3465 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3466 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3467 last = vinfo;
3468 vinfo = next;
3469 }
3470 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3471 /* It can be still vectorized as part of an SLP reduction. */
3472 loop_vinfo->reductions.safe_push (last);
3473 }
3474
3475 /* Find SLP sequences starting from groups of reductions. */
3476 if (loop_vinfo->reductions.length () > 1)
3477 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3478 slp_inst_kind_reduc_group, max_tree_size,
3479 &limit);
3480 }
3481
3482 hash_set<slp_tree> visited_patterns;
3483 slp_tree_to_load_perm_map_t perm_cache;
3484 slp_compat_nodes_map_t compat_cache;
3485
3486 /* See if any patterns can be found in the SLP tree. */
3487 bool pattern_found = false;
3488 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3489 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3490 &visited_patterns, &perm_cache,
3491 &compat_cache);
3492
3493 /* If any were found optimize permutations of loads. */
3494 if (pattern_found)
3495 {
3496 hash_map<slp_tree, slp_tree> load_map;
3497 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3498 {
3499 slp_tree root = SLP_INSTANCE_TREE (instance);
3500 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3501 &load_map, root);
3502 }
3503 }
3504
3505
3506
3507 /* The map keeps a reference on SLP nodes built, release that. */
3508 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3509 it != bst_map->end (); ++it)
3510 if ((*it).second)
3511 vect_free_slp_tree ((*it).second);
3512 delete bst_map;
3513
3514 if (pattern_found && dump_enabled_p ())
3515 {
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "Pattern matched SLP tree\n");
3518 hash_set<slp_tree> visited;
3519 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3520 vect_print_slp_graph (MSG_NOTE, vect_location,
3521 SLP_INSTANCE_TREE (instance), visited);
3522 }
3523
3524 return opt_result::success ();
3525 }
3526
3527 struct slpg_vertex
3528 {
3529 slpg_vertex (slp_tree node_)
3530 : node (node_), perm_in (-1), perm_out (-1) {}
3531
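  /* Return the permutation that has to be materialized at this node:
     PERM_IN when it differs from PERM_OUT, zero (no permute) otherwise.  */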
3532 int get_perm_materialized () const
3533 { return perm_in != perm_out ? perm_in : 0; }
3534
3535 slp_tree node;
3536 /* The common permutation on the incoming lanes (towards SLP children). */
3537 int perm_in;
3538 /* The permutation on the outgoing lanes (towards SLP parents). When
3539 the node is a materialization point for a permute this differs
3540 from perm_in (and is then usually zero). Materialization happens
3541 on the input side. */
3542 int perm_out;
3543 };
3544
3545 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3546
3547 static void
3548 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3549 vec<slpg_vertex> &vertices, vec<int> &leafs)
3550 {
3551 unsigned i;
3552 slp_tree child;
3553
3554 if (visited.add (node))
3555 return;
3556
3557 node->vertex = vertices.length ();
3558 vertices.safe_push (slpg_vertex (node));
3559
3560 bool leaf = true;
3561 bool force_leaf = false;
3562 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3563 if (child)
3564 {
3565 leaf = false;
3566 vect_slp_build_vertices (visited, child, vertices, leafs);
3567 }
3568 else
3569 force_leaf = true;
3570 /* Since SLP discovery works along use-def edges all cycles have an
3571 entry - but there's the exception of cycles where we do not handle
3572 the entry explicitly (but with a NULL SLP node), like some reductions
3573 and inductions. Force those SLP PHIs to act as leafs to make them
3574 backwards reachable. */
3575 if (leaf || force_leaf)
3576 leafs.safe_push (node->vertex);
3577 }
3578
3579 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3580
3581 static void
3582 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3583 vec<int> &leafs)
3584 {
3585 hash_set<slp_tree> visited;
3586 unsigned i;
3587 slp_instance instance;
3588 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3589 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3590 leafs);
3591 }
3592
3593 /* Apply (reverse) bijective PERM to VEC. */
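/* For example PERM = { 2, 0, 1 } maps VEC = { a, b, c } to { c, a, b }
   when applied forward and to { b, c, a } when applied in reverse.  */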
3594
3595 template <class T>
3596 static void
3597 vect_slp_permute (vec<unsigned> perm,
3598 vec<T> &vec, bool reverse)
3599 {
3600 auto_vec<T, 64> saved;
3601 saved.create (vec.length ());
3602 for (unsigned i = 0; i < vec.length (); ++i)
3603 saved.quick_push (vec[i]);
3604
3605 if (reverse)
3606 {
3607 for (unsigned i = 0; i < vec.length (); ++i)
3608 vec[perm[i]] = saved[i];
3609 for (unsigned i = 0; i < vec.length (); ++i)
3610 gcc_assert (vec[perm[i]] == saved[i]);
3611 }
3612 else
3613 {
3614 for (unsigned i = 0; i < vec.length (); ++i)
3615 vec[i] = saved[perm[i]];
3616 for (unsigned i = 0; i < vec.length (); ++i)
3617 gcc_assert (vec[i] == saved[perm[i]]);
3618 }
3619 }
3620
3621 /* Return whether permutations PERM_A and PERM_B as recorded in the
3622 PERMS vector are equal. */
3623
3624 static bool
3625 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3626 int perm_a, int perm_b)
3627 {
3628 return (perm_a == perm_b
3629 || (perm_a != -1 && perm_b != -1
3630 && perms[perm_a].length () == perms[perm_b].length ()
3631 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3632 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3633 }
3634
3635 /* Optimize the SLP graph of VINFO. */
3636
3637 void
3638 vect_optimize_slp (vec_info *vinfo)
3639 {
3640 if (vinfo->slp_instances.is_empty ())
3641 return;
3642
3643 slp_tree node;
3644 unsigned i;
3645 auto_vec<slpg_vertex> vertices;
3646 auto_vec<int> leafs;
3647 vect_slp_build_vertices (vinfo, vertices, leafs);
3648
3649 struct graph *slpg = new_graph (vertices.length ());
3650 for (slpg_vertex &v : vertices)
3651 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3652 if (child)
3653 add_edge (slpg, v.node->vertex, child->vertex);
3654
3655 /* Compute (reverse) postorder on the inverted graph. */
3656 auto_vec<int> ipo;
3657 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3658
3659 auto_vec<vec<unsigned> > perms;
3660 perms.safe_push (vNULL); /* zero is no permute */
3661
3662 /* Produce initial permutations. */
3663 for (i = 0; i < leafs.length (); ++i)
3664 {
3665 int idx = leafs[i];
3666 slp_tree node = vertices[idx].node;
3667
3668 /* Handle externals and constants optimistically throughout the
3669 iteration. But treat existing vectors as fixed since we
3670 do not handle permuting them below. */
3671 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3672 && !SLP_TREE_VEC_DEFS (node).exists ())
3673 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3674 continue;
3675
3676 /* Leafs do not change across iterations. Note leafs also double
3677 as entries to the reverse graph. */
3678 if (!slpg->vertices[idx].succ)
3679 {
3680 vertices[idx].perm_in = 0;
3681 vertices[idx].perm_out = 0;
3682 }
3683
3684 /* Loads are the only thing generating permutes. */
3685 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3686 continue;
3687
3688 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3689 node unpermuted, record this permute. */
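      /* For example a load permutation { 5, 4, 7, 6 } where the smallest
	 accessed group element is 4 is recorded as the bijective permute
	 { 1, 0, 3, 2 }.  */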
3690 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3691 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3692 continue;
3693 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3694 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3695 bool any_permute = false;
3696 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3697 {
3698 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3699 imin = MIN (imin, idx);
3700 imax = MAX (imax, idx);
3701 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3702 any_permute = true;
3703 }
3704 /* If there's no permute no need to split one out. */
3705 if (!any_permute)
3706 continue;
3707 /* If the span doesn't match we'd disrupt VF computation, avoid
3708 that for now. */
3709 if (imax - imin + 1 != SLP_TREE_LANES (node))
3710 continue;
3711
3712 /* For now only handle true permutes, like
3713 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3714 when permuting constants and invariants keeping the permute
3715 bijective. */
3716 auto_sbitmap load_index (SLP_TREE_LANES (node));
3717 bitmap_clear (load_index);
3718 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3719 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3720 unsigned j;
3721 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3722 if (!bitmap_bit_p (load_index, j))
3723 break;
3724 if (j != SLP_TREE_LANES (node))
3725 continue;
3726
3727 vec<unsigned> perm = vNULL;
3728 perm.safe_grow (SLP_TREE_LANES (node), true);
3729 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3730 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3731 perms.safe_push (perm);
3732 vertices[idx].perm_in = perms.length () - 1;
3733 vertices[idx].perm_out = perms.length () - 1;
3734 }
3735
3736 /* We have to mark outgoing permutations facing non-associating-reduction
3737 graph entries that are not represented as to be materialized. */
3738 for (slp_instance instance : vinfo->slp_instances)
3739 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3740 {
3741 /* Just setting perm_out isn't enough for the propagation to
3742 pick this up. */
3743 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3744 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3745 }
3746 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
3747 {
3748 stmt_vec_info stmt_info
3749 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
3750 stmt_vec_info reduc_info = info_for_reduction (vinfo, stmt_info);
3751 if (needs_fold_left_reduction_p (TREE_TYPE
3752 (gimple_get_lhs (stmt_info->stmt)),
3753 STMT_VINFO_REDUC_CODE (reduc_info)))
3754 {
3755 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
3756 vertices[node_i].perm_in = 0;
3757 vertices[node_i].perm_out = 0;
3758 }
3759 }
3760
3761 /* Propagate permutes along the graph and compute materialization points. */
3762 bool changed;
3763 bool do_materialization = false;
3764 unsigned iteration = 0;
3765 do
3766 {
3767 changed = false;
3768 ++iteration;
3769
3770 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 "SLP optimize iteration %d\n", iteration);
3773
3774 for (i = vertices.length (); i > 0 ; --i)
3775 {
3776 int idx = ipo[i-1];
3777 slp_tree node = vertices[idx].node;
3778
3779 /* Handle externals and constants optimistically throughout the
3780 iteration. */
3781 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3782 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3783 continue;
3784
3785 /* We still eventually have failed backedge SLP nodes in the
3786 graph, those are only cancelled when analyzing operations.
3787 Simply treat them as transparent ops, propagating permutes
3788 through them. */
3789 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3790 {
3791 /* We do not handle stores with a permutation, so all
3792 incoming permutes must have been materialized. */
3793 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3794 if (STMT_VINFO_DATA_REF (rep)
3795 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3796 {
3797 /* ??? We're forcing materialization in place
3798 of the child here, we'd need special handling
3799 in materialization to leave perm_in -1 here. */
3800 vertices[idx].perm_in = 0;
3801 vertices[idx].perm_out = 0;
3802 }
3803 /* We cannot move a permute across an operation that is
3804 not independent of the lanes. Note this is an explicit
3805 negative list since that's much shorter than the respective
3806 positive one but it's critical to keep maintaining it. */
3807 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3808 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3809 {
3810 case CFN_COMPLEX_ADD_ROT90:
3811 case CFN_COMPLEX_ADD_ROT270:
3812 case CFN_COMPLEX_MUL:
3813 case CFN_COMPLEX_MUL_CONJ:
3814 case CFN_VEC_ADDSUB:
3815 case CFN_VEC_FMADDSUB:
3816 case CFN_VEC_FMSUBADD:
3817 vertices[idx].perm_in = 0;
3818 vertices[idx].perm_out = 0;
3819 default:;
3820 }
3821 }
3822
3823 if (!slpg->vertices[idx].succ)
3824 /* Pick up pre-computed leaf values. */
3825 ;
3826 else
3827 {
3828 bool any_succ_perm_out_m1 = false;
3829 int perm_in = vertices[idx].perm_in;
3830 for (graph_edge *succ = slpg->vertices[idx].succ;
3831 succ; succ = succ->succ_next)
3832 {
3833 int succ_idx = succ->dest;
3834 int succ_perm = vertices[succ_idx].perm_out;
3835 /* Handle unvisited (and constant) nodes optimistically. */
3836 /* ??? But for constants once we want to handle
3837 non-bijective permutes we have to verify the permute,
3838 when unifying lanes, will not unify different constants.
3839 For example see gcc.dg/vect/bb-slp-14.c for a case
3840 that would break. */
3841 if (succ_perm == -1)
3842 {
3843 /* When we handled a non-leaf optimistically, note
3844 that so we can adjust its outgoing permute below. */
3845 slp_tree succ_node = vertices[succ_idx].node;
3846 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3847 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3848 any_succ_perm_out_m1 = true;
3849 continue;
3850 }
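	      /* Meet the successors outgoing permute with what we collected
		 so far: "any" (-1) meets X as X, while an unpermuted
		 successor or two different permutes collapse the result
		 to zero (no permute).  */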
3851 if (perm_in == -1)
3852 perm_in = succ_perm;
3853 else if (succ_perm == 0
3854 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3855 {
3856 perm_in = 0;
3857 break;
3858 }
3859 }
3860
3861 /* Adjust any incoming permutes we treated optimistically. */
3862 if (perm_in != -1 && any_succ_perm_out_m1)
3863 {
3864 for (graph_edge *succ = slpg->vertices[idx].succ;
3865 succ; succ = succ->succ_next)
3866 {
3867 slp_tree succ_node = vertices[succ->dest].node;
3868 if (vertices[succ->dest].perm_out == -1
3869 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3870 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3871 {
3872 vertices[succ->dest].perm_out = perm_in;
3873 /* And ensure this propagates. */
3874 if (vertices[succ->dest].perm_in == -1)
3875 vertices[succ->dest].perm_in = perm_in;
3876 }
3877 }
3878 changed = true;
3879 }
3880
3881 if (!vect_slp_perms_eq (perms, perm_in,
3882 vertices[idx].perm_in))
3883 {
3884 /* Make sure we eventually converge. */
3885 gcc_checking_assert (vertices[idx].perm_in == -1
3886 || perm_in == 0);
3887 vertices[idx].perm_in = perm_in;
3888
3889 /* While we can handle VEC_PERM nodes as transparent
3890 pass-through they can be a cheap materialization
3891 point as well. In addition they can act as source
3892 of a random permutation as well.
3893 The following ensures that former materialization
3894 points that now have zero incoming permutes no
3895 longer appear as such and that former "any" permutes
3896 get pass-through. We keep VEC_PERM nodes optimistic
3897 as "any" outgoing permute though. */
3898 if (vertices[idx].perm_out != 0
3899 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3900 vertices[idx].perm_out = perm_in;
3901 changed = true;
3902 }
3903 }
3904
3905 /* Elide pruning at materialization points in the first
3906 iteration phase. */
3907 if (!do_materialization)
3908 continue;
3909
3910 int perm = vertices[idx].perm_out;
3911 if (perm == 0 || perm == -1)
3912 continue;
3913
3914 /* Decide on permute materialization. Look whether there's
3915 a use (pred) edge that is permuted differently than us.
3916 In that case mark ourselves so the permutation is applied. */
3917 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3918 if (all_preds_permuted)
3919 for (graph_edge *pred = slpg->vertices[idx].pred;
3920 pred; pred = pred->pred_next)
3921 {
3922 int pred_perm = vertices[pred->src].perm_in;
3923 gcc_checking_assert (pred_perm != -1);
3924 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3925 {
3926 all_preds_permuted = false;
3927 break;
3928 }
3929 }
3930 if (!all_preds_permuted)
3931 {
3932 vertices[idx].perm_out = 0;
3933 changed = true;
3934 }
3935 }
3936
3937 /* If the initial propagation converged, switch on materialization
3938 and re-propagate. */
3939 if (!changed && !do_materialization)
3940 {
3941 do_materialization = true;
3942 changed = true;
3943 }
3944 }
3945 while (changed);
3946 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3947
3948 /* Materialize. */
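/* Walk all vertices and apply the permutes computed above: permute
   invariant and external operands, adjust VEC_PERM nodes and load
   permutations in place, and insert an explicit VEC_PERM node where a
   permute has to be materialized on another internal node.  */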
3949 for (i = 0; i < vertices.length (); ++i)
3950 {
3951 int perm_in = vertices[i].perm_in;
3952 slp_tree node = vertices[i].node;
3953
3954 /* First permute invariant/external original successors, we handle
3955 those optimistically during propagation and duplicate them if
3956 they are used with different permutations. */
3957 unsigned j;
3958 slp_tree child;
3959 if (perm_in > 0)
3960 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3961 {
3962 if (!child
3963 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3964 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3965 continue;
3966
3967 /* If the vector is uniform there's nothing to do. */
3968 if (vect_slp_tree_uniform_p (child))
3969 continue;
3970
3971 /* We can end up sharing some externals via two_operator
3972 handling. Be prepared to unshare those. */
3973 if (child->refcnt != 1)
3974 {
3975 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3976 SLP_TREE_CHILDREN (node)[j] = child
3977 = vect_create_new_slp_node
3978 (SLP_TREE_SCALAR_OPS (child).copy ());
3979 }
3980 vect_slp_permute (perms[perm_in],
3981 SLP_TREE_SCALAR_OPS (child), true);
3982 }
3983
3984 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3985 {
3986 /* Apply the common permutes to the input vectors. */
3987 if (perm_in > 0)
3988 {
3989 /* If the node is already a permute node we can apply
3990 the permutation to the lane selection, effectively
3991 materializing it on the incoming vectors. */
3992 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_NOTE, vect_location,
3994 "simplifying permute node %p\n",
3995 node);
3996 for (unsigned k = 0;
3997 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3998 SLP_TREE_LANE_PERMUTATION (node)[k].second
3999 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
4000 }
4001 /* Apply the anticipated output permute to the permute and
4002 stmt vectors. */
4003 int perm_out = vertices[i].perm_out;
4004 if (perm_out > 0)
4005 {
4006 vect_slp_permute (perms[perm_out],
4007 SLP_TREE_SCALAR_STMTS (node), true);
4008 vect_slp_permute (perms[perm_out],
4009 SLP_TREE_LANE_PERMUTATION (node), true);
4010 }
4011 }
4012 else if (vertices[i].get_perm_materialized () != 0)
4013 {
4014 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4015 /* For loads simply drop the permutation, the load permutation
4016 already performs the desired permutation. */
4017 ;
4018 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4019 gcc_unreachable ();
4020 else
4021 {
4022 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_NOTE, vect_location,
4024 "inserting permute node in place of %p\n",
4025 node);
4026
4027 /* Make a copy of NODE and in-place change it to a
4028 VEC_PERM node to permute the lanes of the copy. */
4029 slp_tree copy = new _slp_tree;
4030 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
4031 SLP_TREE_CHILDREN (node) = vNULL;
4032 SLP_TREE_SCALAR_STMTS (copy)
4033 = SLP_TREE_SCALAR_STMTS (node).copy ();
4034 vect_slp_permute (perms[perm_in],
4035 SLP_TREE_SCALAR_STMTS (copy), true);
4036 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4037 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4038 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4039 SLP_TREE_LANE_PERMUTATION (copy)
4040 = SLP_TREE_LANE_PERMUTATION (node);
4041 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4042 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4043 copy->refcnt = 1;
4044 copy->max_nunits = node->max_nunits;
4045 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4046 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4047 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4048
4049 /* Now turn NODE into a VEC_PERM. */
4050 SLP_TREE_CHILDREN (node).safe_push (copy);
4051 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4052 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4053 SLP_TREE_LANE_PERMUTATION (node)
4054 .quick_push (std::make_pair (0, perms[perm_in][j]));
4055 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4056 }
4057 }
4058 else if (perm_in > 0) /* perm_in == perm_out */
4059 {
4060 /* Apply the reverse permutation to our stmts. */
4061 vect_slp_permute (perms[perm_in],
4062 SLP_TREE_SCALAR_STMTS (node), true);
4063 /* And to the lane/load permutation, which we can simply
4064 make regular by design. */
4065 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4066 {
4067 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4068 /* ??? When we handle non-bijective permutes the idea
4069 is that we can force the load-permutation to be
4070 { min, min + 1, min + 2, ... max }. But then the
4071 scalar defs might no longer match the lane content
4072 which means wrong-code with live lane vectorization.
4073 So we possibly have to have NULL entries for those. */
4074 vect_slp_permute (perms[perm_in],
4075 SLP_TREE_LOAD_PERMUTATION (node), true);
4076 }
4077 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4078 gcc_unreachable ();
4079 }
4080 }
4081
4082 /* Elide any permutations at BB reduction roots. */
4083 if (is_a <bb_vec_info> (vinfo))
4084 {
4085 for (slp_instance instance : vinfo->slp_instances)
4086 {
4087 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4088 continue;
4089 slp_tree old = SLP_INSTANCE_TREE (instance);
4090 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4091 && SLP_TREE_CHILDREN (old).length () == 1)
4092 {
4093 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4094 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4095 {
4096 /* Preserve the special VEC_PERM we use to shield existing
4097 vector defs from the rest. But make it a no-op. */
4098 auto_vec<stmt_vec_info, 64> saved;
4099 saved.create (SLP_TREE_SCALAR_STMTS (old).length ());
4100 for (unsigned i = 0;
4101 i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4102 saved.quick_push (SLP_TREE_SCALAR_STMTS (old)[i]);
4103 for (unsigned i = 0;
4104 i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4105 SLP_TREE_SCALAR_STMTS (old)[i]
4106 = saved[SLP_TREE_LANE_PERMUTATION (old)[i].second];
4107 unsigned i = 0;
4108 for (std::pair<unsigned, unsigned> &p
4109 : SLP_TREE_LANE_PERMUTATION (old))
4110 p.second = i++;
4111 }
4112 else
4113 {
4114 SLP_INSTANCE_TREE (instance) = child;
4115 SLP_TREE_REF_COUNT (child)++;
4116 vect_free_slp_tree (old);
4117 }
4118 }
4119 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4120 && SLP_TREE_REF_COUNT (old) == 1
4121 && vertices[old->vertex].get_perm_materialized () != 0)
4122 {
4123 /* ??? For loads the situation is more complex since
4124 we can't modify the permute in place in case the
4125 node is used multiple times. In fact for loads this
4126 should be somehow handled in the propagation engine. */
4127 /* Apply the reverse permutation to our stmts. */
4128 int perm = vertices[old->vertex].get_perm_materialized ();
4129 vect_slp_permute (perms[perm],
4130 SLP_TREE_SCALAR_STMTS (old), true);
4131 vect_slp_permute (perms[perm],
4132 SLP_TREE_LOAD_PERMUTATION (old), true);
4133 }
4134 }
4135 }
4136
4137 /* Free the perms vector used for propagation. */
4138 while (!perms.is_empty ())
4139 perms.pop ().release ();
4140 free_graph (slpg);
4141
4142
4143 /* Now elide load permutations that are not necessary. */
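/* A load permutation can be dropped when it just follows a subchain of
   the interleaving chain (BB vectorization) or, in loop vectorization,
   when it is the identity and unrolling does not expose a gap.  */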
4144 for (i = 0; i < leafs.length (); ++i)
4145 {
4146 node = vertices[leafs[i]].node;
4147 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4148 continue;
4149
4150 /* In basic block vectorization we allow any subchain of an interleaving
4151 chain.
4152 FORNOW: not in loop SLP because of realignment complications. */
4153 if (is_a <bb_vec_info> (vinfo))
4154 {
4155 bool subchain_p = true;
4156 stmt_vec_info next_load_info = NULL;
4157 stmt_vec_info load_info;
4158 unsigned j;
4159 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4160 {
4161 if (j != 0
4162 && (next_load_info != load_info
4163 || DR_GROUP_GAP (load_info) != 1))
4164 {
4165 subchain_p = false;
4166 break;
4167 }
4168 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4169 }
4170 if (subchain_p)
4171 {
4172 SLP_TREE_LOAD_PERMUTATION (node).release ();
4173 continue;
4174 }
4175 }
4176 else
4177 {
4178 stmt_vec_info load_info;
4179 bool this_load_permuted = false;
4180 unsigned j;
4181 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4182 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4183 {
4184 this_load_permuted = true;
4185 break;
4186 }
4187 stmt_vec_info first_stmt_info
4188 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4189 if (!this_load_permuted
4190 /* The load requires permutation when unrolling exposes
4191 a gap either because the group is larger than the SLP
4192 group-size or because there is a gap between the groups. */
4193 && (known_eq (LOOP_VINFO_VECT_FACTOR
4194 (as_a <loop_vec_info> (vinfo)), 1U)
4195 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4196 && DR_GROUP_GAP (first_stmt_info) == 0)))
4197 {
4198 SLP_TREE_LOAD_PERMUTATION (node).release ();
4199 continue;
4200 }
4201 }
4202 }
4203 }
4204
4205 /* Gather loads reachable from the individual SLP graph entries. */
4206
4207 void
4208 vect_gather_slp_loads (vec_info *vinfo)
4209 {
4210 unsigned i;
4211 slp_instance instance;
4212 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4213 {
4214 hash_set<slp_tree> visited;
4215 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4216 SLP_INSTANCE_TREE (instance), visited);
4217 }
4218 }
4219
4220
4221 /* For each possible SLP instance decide whether to SLP it and calculate overall
4222 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4223 least one instance. */
4224
4225 bool
4226 vect_make_slp_decision (loop_vec_info loop_vinfo)
4227 {
4228 unsigned int i;
4229 poly_uint64 unrolling_factor = 1;
4230 const vec<slp_instance> &slp_instances
4231 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4232 slp_instance instance;
4233 int decided_to_slp = 0;
4234
4235 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4236
4237 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4238 {
4239 /* FORNOW: SLP if you can. */
4240 /* All unroll factors have the form:
4241
4242 GET_MODE_SIZE (vinfo->vector_mode) * X
4243
4244 for some rational X, so they must have a common multiple. */
4245 unrolling_factor
4246 = force_common_multiple (unrolling_factor,
4247 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4248
4249 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4250 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4251 loop-based vectorization. Such stmts will be marked as HYBRID. */
4252 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4253 decided_to_slp++;
4254 }
4255
4256 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4257
4258 if (decided_to_slp && dump_enabled_p ())
4259 {
4260 dump_printf_loc (MSG_NOTE, vect_location,
4261 "Decided to SLP %d instances. Unrolling factor ",
4262 decided_to_slp);
4263 dump_dec (MSG_NOTE, unrolling_factor);
4264 dump_printf (MSG_NOTE, "\n");
4265 }
4266
4267 return (decided_to_slp > 0);
4268 }
4269
4270 /* Private data for vect_detect_hybrid_slp. */
4271 struct vdhs_data
4272 {
4273 loop_vec_info loop_vinfo;
4274 vec<stmt_vec_info> *worklist;
4275 };
4276
4277 /* Walker for walk_gimple_op. */
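/* Mark a still pure-SLP definition used by a loop_vect or hybrid stmt
   as hybrid and queue it so its own operands are processed as well.  */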
4278
4279 static tree
4280 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4281 {
4282 walk_stmt_info *wi = (walk_stmt_info *)data;
4283 vdhs_data *dat = (vdhs_data *)wi->info;
4284
4285 if (wi->is_lhs)
4286 return NULL_TREE;
4287
4288 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4289 if (!def_stmt_info)
4290 return NULL_TREE;
4291 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4292 if (PURE_SLP_STMT (def_stmt_info))
4293 {
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4296 def_stmt_info->stmt);
4297 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4298 dat->worklist->safe_push (def_stmt_info);
4299 }
4300
4301 return NULL_TREE;
4302 }
4303
4304 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4305 if so, otherwise push it to WORKLIST. */
4306
4307 static void
4308 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4309 vec<stmt_vec_info> &worklist,
4310 stmt_vec_info stmt_info)
4311 {
4312 if (dump_enabled_p ())
4313 dump_printf_loc (MSG_NOTE, vect_location,
4314 "Processing hybrid candidate : %G", stmt_info->stmt);
4315 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4316 imm_use_iterator iter2;
4317 ssa_op_iter iter1;
4318 use_operand_p use_p;
4319 def_operand_p def_p;
4320 bool any_def = false;
4321 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4322 {
4323 any_def = true;
4324 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4325 {
4326 if (is_gimple_debug (USE_STMT (use_p)))
4327 continue;
4328 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4329 /* An out-of-loop use means this is a loop_vect sink. */
4330 if (!use_info)
4331 {
4332 if (dump_enabled_p ())
4333 dump_printf_loc (MSG_NOTE, vect_location,
4334 "Found loop_vect sink: %G", stmt_info->stmt);
4335 worklist.safe_push (stmt_info);
4336 return;
4337 }
4338 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4339 {
4340 if (dump_enabled_p ())
4341 dump_printf_loc (MSG_NOTE, vect_location,
4342 "Found loop_vect use: %G", use_info->stmt);
4343 worklist.safe_push (stmt_info);
4344 return;
4345 }
4346 }
4347 }
4348 /* No def means this is a loop_vect sink. */
4349 if (!any_def)
4350 {
4351 if (dump_enabled_p ())
4352 dump_printf_loc (MSG_NOTE, vect_location,
4353 "Found loop_vect sink: %G", stmt_info->stmt);
4354 worklist.safe_push (stmt_info);
4355 return;
4356 }
4357 if (dump_enabled_p ())
4358 dump_printf_loc (MSG_NOTE, vect_location,
4359 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4360 STMT_SLP_TYPE (stmt_info) = pure_slp;
4361 }
4362
4363 /* Find stmts that must be both vectorized and SLPed. */
4364
4365 void
4366 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4367 {
4368 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4369
4370 /* All stmts participating in SLP are marked pure_slp, all other
4371 stmts are loop_vect.
4372 First collect all loop_vect stmts into a worklist.
4373 SLP patterns cause not all original scalar stmts to appear in
4374 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4375 Rectify this here and do a backward walk over the IL only considering
4376 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4377 mark them as pure_slp. */
4378 auto_vec<stmt_vec_info> worklist;
4379 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4380 {
4381 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4382 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4383 gsi_next (&gsi))
4384 {
4385 gphi *phi = gsi.phi ();
4386 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4387 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4388 maybe_push_to_hybrid_worklist (loop_vinfo,
4389 worklist, stmt_info);
4390 }
4391 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4392 gsi_prev (&gsi))
4393 {
4394 gimple *stmt = gsi_stmt (gsi);
4395 if (is_gimple_debug (stmt))
4396 continue;
4397 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4398 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4399 {
4400 for (gimple_stmt_iterator gsi2
4401 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4402 !gsi_end_p (gsi2); gsi_next (&gsi2))
4403 {
4404 stmt_vec_info patt_info
4405 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4406 if (!STMT_SLP_TYPE (patt_info)
4407 && STMT_VINFO_RELEVANT (patt_info))
4408 maybe_push_to_hybrid_worklist (loop_vinfo,
4409 worklist, patt_info);
4410 }
4411 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4412 }
4413 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4414 maybe_push_to_hybrid_worklist (loop_vinfo,
4415 worklist, stmt_info);
4416 }
4417 }
4418
4419 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4420 mark any SLP vectorized stmt as hybrid.
4421 ??? We're visiting def stmts N times (once for each non-SLP and
4422 once for each hybrid-SLP use). */
4423 walk_stmt_info wi;
4424 vdhs_data dat;
4425 dat.worklist = &worklist;
4426 dat.loop_vinfo = loop_vinfo;
4427 memset (&wi, 0, sizeof (wi));
4428 wi.info = (void *)&dat;
4429 while (!worklist.is_empty ())
4430 {
4431 stmt_vec_info stmt_info = worklist.pop ();
4432 /* Since SSA operands are not set up for pattern stmts we need
4433 to use walk_gimple_op. */
4434 wi.is_lhs = 0;
4435 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4436 /* For gather/scatter make sure to walk the offset operand, that
4437 can be a scaling and conversion away. */
4438 gather_scatter_info gs_info;
4439 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4440 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
4441 {
4442 int dummy;
4443 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
4444 }
4445 }
4446 }
4447
4448
4449 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4450
4451 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4452 : vec_info (vec_info::bb, shared),
4453 bbs (_bbs),
4454 roots (vNULL)
4455 {
4456 for (unsigned i = 0; i < bbs.length (); ++i)
4457 {
4458 if (i != 0)
4459 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4460 gsi_next (&si))
4461 {
4462 gphi *phi = si.phi ();
4463 gimple_set_uid (phi, 0);
4464 add_stmt (phi);
4465 }
4466 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4467 !gsi_end_p (gsi); gsi_next (&gsi))
4468 {
4469 gimple *stmt = gsi_stmt (gsi);
4470 gimple_set_uid (stmt, 0);
4471 if (is_gimple_debug (stmt))
4472 continue;
4473 add_stmt (stmt);
4474 }
4475 }
4476 }
4477
4478
4479 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4480 stmts in the basic block. */
4481
4482 _bb_vec_info::~_bb_vec_info ()
4483 {
4484 /* Reset region marker. */
4485 for (unsigned i = 0; i < bbs.length (); ++i)
4486 {
4487 if (i != 0)
4488 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4489 gsi_next (&si))
4490 {
4491 gphi *phi = si.phi ();
4492 gimple_set_uid (phi, -1);
4493 }
4494 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4495 !gsi_end_p (gsi); gsi_next (&gsi))
4496 {
4497 gimple *stmt = gsi_stmt (gsi);
4498 gimple_set_uid (stmt, -1);
4499 }
4500 }
4501
4502 for (unsigned i = 0; i < roots.length (); ++i)
4503 {
4504 roots[i].stmts.release ();
4505 roots[i].roots.release ();
4506 }
4507 roots.release ();
4508 }
4509
4510 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4511 given that its child nodes have already been processed, and that
4512 their def types currently match their SLP node's def type. */
4513
4514 static bool
4515 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4516 slp_instance node_instance,
4517 stmt_vector_for_cost *cost_vec)
4518 {
4519 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4520
4521 /* Calculate the number of vector statements to be created for the
4522 scalar stmts in this node. For SLP reductions it is equal to the
4523 number of vector statements in the children (which has already been
4524 calculated by the recursive call). Otherwise it is the number of
4525 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4526 VF divided by the number of elements in a vector. */
4527 if (!STMT_VINFO_DATA_REF (stmt_info)
4528 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4529 {
4530 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4531 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4532 {
4533 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4534 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4535 break;
4536 }
4537 }
4538 else
4539 {
4540 poly_uint64 vf;
4541 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4542 vf = loop_vinfo->vectorization_factor;
4543 else
4544 vf = 1;
4545 unsigned int group_size = SLP_TREE_LANES (node);
4546 tree vectype = SLP_TREE_VECTYPE (node);
4547 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4548 = vect_get_num_vectors (vf * group_size, vectype);
4549 }
4550
4551 /* Handle purely internal nodes. */
4552 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4553 {
4554 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
4555 return false;
4556
4557 stmt_vec_info slp_stmt_info;
4558 unsigned int i;
4559 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
4560 {
4561 if (STMT_VINFO_LIVE_P (slp_stmt_info)
4562 && !vectorizable_live_operation (vinfo,
4563 slp_stmt_info, NULL, node,
4564 node_instance, i,
4565 false, cost_vec))
4566 return false;
4567 }
4568 return true;
4569 }
4570
4571 bool dummy;
4572 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4573 node, node_instance, cost_vec);
4574 }
4575
4576 /* Try to build NODE from scalars, returning true on success.
4577 NODE_INSTANCE is the SLP instance that contains NODE. */
4578
4579 static bool
4580 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4581 slp_instance node_instance)
4582 {
4583 stmt_vec_info stmt_info;
4584 unsigned int i;
4585
4586 if (!is_a <bb_vec_info> (vinfo)
4587 || node == SLP_INSTANCE_TREE (node_instance)
4588 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4589 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4590 return false;
4591
4592 if (dump_enabled_p ())
4593 dump_printf_loc (MSG_NOTE, vect_location,
4594 "Building vector operands of %p from scalars instead\n", node);
4595
4596 /* Don't remove and free the child nodes here, since they could be
4597 referenced by other structures. The analysis and scheduling phases
4598 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4599 unsigned int group_size = SLP_TREE_LANES (node);
4600 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4601 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4602 SLP_TREE_LOAD_PERMUTATION (node).release ();
4603 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4604 {
4605 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4606 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4607 }
4608 return true;
4609 }
4610
4611 /* Return true if all elements of the slice are the same. */
4612 bool
4613 vect_scalar_ops_slice::all_same_p () const
4614 {
4615 for (unsigned int i = 1; i < length; ++i)
4616 if (!operand_equal_p (op (0), op (i)))
4617 return false;
4618 return true;
4619 }
4620
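/* Compute a hash for the scalar operands of slice S.  */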
4621 hashval_t
4622 vect_scalar_ops_slice_hash::hash (const value_type &s)
4623 {
4624 hashval_t hash = 0;
4625 for (unsigned i = 0; i < s.length; ++i)
4626 hash = iterative_hash_expr (s.op (i), hash);
4627 return hash;
4628 }
4629
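/* Return true if slices S1 and S2 have the same length and pairwise
   equal operands.  */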
4630 bool
4631 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4632 const compare_type &s2)
4633 {
4634 if (s1.length != s2.length)
4635 return false;
4636 for (unsigned i = 0; i < s1.length; ++i)
4637 if (!operand_equal_p (s1.op (i), s2.op (i)))
4638 return false;
4639 return true;
4640 }
4641
4642 /* Compute the prologue cost for invariant or constant operands represented
4643 by NODE. */
4644
4645 static void
4646 vect_prologue_cost_for_slp (slp_tree node,
4647 stmt_vector_for_cost *cost_vec)
4648 {
4649 /* There's a special case of an existing vector, that costs nothing. */
4650 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4651 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4652 return;
4653 /* Without looking at the actual initializer a vector of
4654 constants can be implemented as load from the constant pool.
4655 When all elements are the same we can use a splat. */
4656 tree vectype = SLP_TREE_VECTYPE (node);
4657 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4658 unsigned HOST_WIDE_INT const_nunits;
4659 unsigned nelt_limit;
4660 auto ops = &SLP_TREE_SCALAR_OPS (node);
4661 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4662 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4663 && ! multiple_p (const_nunits, group_size))
4664 {
4665 nelt_limit = const_nunits;
4666 hash_set<vect_scalar_ops_slice_hash> vector_ops;
4667 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4668 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4669 starts.quick_push (i * const_nunits);
4670 }
4671 else
4672 {
4673 /* If either the vector has variable length or the vectors
4674 are composed of repeated whole groups we only need to
4675 cost construction once. All vectors will be the same. */
4676 nelt_limit = group_size;
4677 starts.quick_push (0);
4678 }
4679 /* ??? We're just tracking whether vectors in a single node are the same.
4680 Ideally we'd do something more global. */
4681 for (unsigned int start : starts)
4682 {
4683 vect_cost_for_stmt kind;
4684 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4685 kind = vector_load;
4686 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4687 kind = scalar_to_vec;
4688 else
4689 kind = vec_construct;
4690 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4691 }
4692 }
4693
4694 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4695 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4696
4697 Return true if the operations are supported. */
4698
4699 static bool
4700 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4701 slp_instance node_instance,
4702 hash_set<slp_tree> &visited_set,
4703 vec<slp_tree> &visited_vec,
4704 stmt_vector_for_cost *cost_vec)
4705 {
4706 int i, j;
4707 slp_tree child;
4708
4709 /* Assume we can code-generate all invariants. */
4710 if (!node
4711 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4712 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4713 return true;
4714
4715 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4716 {
4717 if (dump_enabled_p ())
4718 dump_printf_loc (MSG_NOTE, vect_location,
4719 "Failed cyclic SLP reference in %p\n", node);
4720 return false;
4721 }
4722 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4723
4724 /* If we already analyzed the exact same set of scalar stmts we're done.
4725 We share the generated vector stmts for those. */
4726 if (visited_set.add (node))
4727 return true;
4728 visited_vec.safe_push (node);
4729
4730 bool res = true;
4731 unsigned visited_rec_start = visited_vec.length ();
4732 unsigned cost_vec_rec_start = cost_vec->length ();
4733 bool seen_non_constant_child = false;
4734 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4735 {
4736 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4737 visited_set, visited_vec,
4738 cost_vec);
4739 if (!res)
4740 break;
4741 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4742 seen_non_constant_child = true;
4743 }
4744 /* We're having difficulties scheduling nodes with just constant
4745 operands and no scalar stmts since we then cannot compute a stmt
4746 insertion place. */
4747 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4748 {
4749 if (dump_enabled_p ())
4750 dump_printf_loc (MSG_NOTE, vect_location,
4751 "Cannot vectorize all-constant op node %p\n", node);
4752 res = false;
4753 }
4754
4755 if (res)
4756 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4757 cost_vec);
4758 /* If analysis failed we have to pop all recursive visited nodes
4759 plus ourselves. */
4760 if (!res)
4761 {
4762 while (visited_vec.length () >= visited_rec_start)
4763 visited_set.remove (visited_vec.pop ());
4764 cost_vec->truncate (cost_vec_rec_start);
4765 }
4766
4767 /* When the node can be vectorized cost invariant nodes it references.
4768 This is not done in DFS order to allow the referring node
4769 vectorizable_* calls to nail down the invariant nodes vector type
4770 and possibly unshare it if it needs a different vector type than
4771 other referrers. */
4772 if (res)
4773 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4774 if (child
4775 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4776 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4777 /* Perform usual caching, note code-generation still
4778 code-gens these nodes multiple times but we expect
4779 to CSE them later. */
4780 && !visited_set.add (child))
4781 {
4782 visited_vec.safe_push (child);
4783 /* ??? After auditing more code paths make a "default"
4784 and push the vector type from NODE to all children
4785 if it is not already set. */
4786 /* Compute the number of vectors to be generated. */
4787 tree vector_type = SLP_TREE_VECTYPE (child);
4788 if (!vector_type)
4789 {
4790 /* For shifts with a scalar argument we don't need
4791 to cost or code-generate anything.
4792 ??? Represent this more explicitly. */
4793 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4794 == shift_vec_info_type)
4795 && j == 1);
4796 continue;
4797 }
4798 unsigned group_size = SLP_TREE_LANES (child);
4799 poly_uint64 vf = 1;
4800 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4801 vf = loop_vinfo->vectorization_factor;
4802 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4803 = vect_get_num_vectors (vf * group_size, vector_type);
4804 /* And cost them. */
4805 vect_prologue_cost_for_slp (child, cost_vec);
4806 }
4807
4808 /* If this node or any of its children can't be vectorized, try pruning
4809 the tree here rather than felling the whole thing. */
4810 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4811 {
4812 /* We'll need to revisit this for invariant costing and number
4813 of vectorized stmt setting. */
4814 res = true;
4815 }
4816
4817 return res;
4818 }
4819
4820 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4821 region and that can be vectorized using vectorizable_live_operation
4822 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
4823 scalar code computing it to be retained. */
4824
4825 static void
4826 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4827 slp_instance instance,
4828 stmt_vector_for_cost *cost_vec,
4829 hash_set<stmt_vec_info> &svisited,
4830 hash_set<slp_tree> &visited)
4831 {
4832 if (visited.add (node))
4833 return;
4834
4835 unsigned i;
4836 stmt_vec_info stmt_info;
4837 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4838 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4839 {
4840 if (svisited.contains (stmt_info))
4841 continue;
4842 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4843 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4844 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4845 /* Only the pattern root stmt computes the original scalar value. */
4846 continue;
4847 bool mark_visited = true;
4848 gimple *orig_stmt = orig_stmt_info->stmt;
4849 ssa_op_iter op_iter;
4850 def_operand_p def_p;
4851 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4852 {
4853 imm_use_iterator use_iter;
4854 gimple *use_stmt;
4855 stmt_vec_info use_stmt_info;
4856 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4857 if (!is_gimple_debug (use_stmt))
4858 {
4859 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4860 if (!use_stmt_info
4861 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4862 {
4863 STMT_VINFO_LIVE_P (stmt_info) = true;
4864 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4865 NULL, node, instance, i,
4866 false, cost_vec))
4867 /* ??? So we know we can vectorize the live stmt
4868 from one SLP node. If we cannot do so from all
4869 or none consistently we'd have to record which
4870 SLP node (and lane) we want to use for the live
4871 operation. So make sure we can code-generate
4872 from all nodes. */
4873 mark_visited = false;
4874 else
4875 STMT_VINFO_LIVE_P (stmt_info) = false;
4876 break;
4877 }
4878 }
4879 /* We have to verify whether we can insert the lane extract
4880 before all uses. The following is a conservative approximation.
4881 We cannot put this into vectorizable_live_operation because
4882 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4883 doesn't work.
4884 Note that while the fact that we emit code for loads at the
4885 first load should make this a non-problem, leafs we construct
4886 from scalars are vectorized after the last scalar def.
4887 ??? If we'd actually compute the insert location during
4888 analysis we could use sth less conservative than the last
4889 scalar stmt in the node for the dominance check. */
4890 /* ??? What remains is "live" uses in vector CTORs in the same
4891 SLP graph which is where those uses can end up code-generated
4892 right after their definition instead of close to their original
4893 use. But that would restrict us to code-generate lane-extracts
4894 from the latest stmt in a node. So we compensate for this
4895 during code-generation, simply not replacing uses for those
4896 hopefully rare cases. */
4897 if (STMT_VINFO_LIVE_P (stmt_info))
4898 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4899 if (!is_gimple_debug (use_stmt)
4900 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4901 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4902 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4903 {
4904 if (dump_enabled_p ())
4905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4906 "Cannot determine insertion place for "
4907 "lane extract\n");
4908 STMT_VINFO_LIVE_P (stmt_info) = false;
4909 mark_visited = true;
4910 }
4911 }
4912 if (mark_visited)
4913 svisited.add (stmt_info);
4914 }
4915
4916 slp_tree child;
4917 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4918 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4919 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4920 cost_vec, svisited, visited);
4921 }
4922
4923 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4924
4925 static bool
4926 vectorizable_bb_reduc_epilogue (slp_instance instance,
4927 stmt_vector_for_cost *cost_vec)
4928 {
4929 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4930 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4931 if (reduc_code == MINUS_EXPR)
4932 reduc_code = PLUS_EXPR;
4933 internal_fn reduc_fn;
4934 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4935 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4936 || reduc_fn == IFN_LAST
4937 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4938 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4939 TREE_TYPE (vectype)))
4940 return false;
4941
4942 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4943 cost log2 vector operations plus shuffles and one extraction. */
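/* For example (illustrative only): with an eight-lane vector type this
   gives steps = floor_log2 (8) = 3, so we record three vector ops,
   three permutes and a single vec_to_scalar lane extraction below.  */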
4944 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4945 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4946 vectype, 0, vect_body);
4947 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4948 vectype, 0, vect_body);
4949 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4950 vectype, 0, vect_body);
4951 return true;
4952 }
4953
4954 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4955 and recurse to children. */
4956
4957 static void
4958 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4959 hash_set<slp_tree> &visited)
4960 {
4961 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4962 || visited.add (node))
4963 return;
4964
4965 stmt_vec_info stmt;
4966 unsigned i;
4967 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4968 roots.remove (vect_orig_stmt (stmt));
4969
4970 slp_tree child;
4971 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4972 if (child)
4973 vect_slp_prune_covered_roots (child, roots, visited);
4974 }
4975
4976 /* Analyze statements in SLP instances of VINFO. Return true if the
4977 operations are supported. */
4978
4979 bool
4980 vect_slp_analyze_operations (vec_info *vinfo)
4981 {
4982 slp_instance instance;
4983 int i;
4984
4985 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4986
4987 hash_set<slp_tree> visited;
4988 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4989 {
4990 auto_vec<slp_tree> visited_vec;
4991 stmt_vector_for_cost cost_vec;
4992 cost_vec.create (2);
4993 if (is_a <bb_vec_info> (vinfo))
4994 vect_location = instance->location ();
4995 if (!vect_slp_analyze_node_operations (vinfo,
4996 SLP_INSTANCE_TREE (instance),
4997 instance, visited, visited_vec,
4998 &cost_vec)
4999 /* CTOR instances require vectorized defs for the SLP tree root. */
5000 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
5001 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
5002 != vect_internal_def
5003 /* Make sure we vectorized with the expected type. */
5004 || !useless_type_conversion_p
5005 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
5006 (instance->root_stmts[0]->stmt))),
5007 TREE_TYPE (SLP_TREE_VECTYPE
5008 (SLP_INSTANCE_TREE (instance))))))
5009 /* Check we can vectorize the reduction. */
5010 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
5011 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
5012 {
5013 slp_tree node = SLP_INSTANCE_TREE (instance);
5014 stmt_vec_info stmt_info;
5015 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5016 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5017 else
5018 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5019 if (dump_enabled_p ())
5020 dump_printf_loc (MSG_NOTE, vect_location,
5021 "removing SLP instance operations starting from: %G",
5022 stmt_info->stmt);
5023 vect_free_slp_instance (instance);
5024 vinfo->slp_instances.ordered_remove (i);
5025 cost_vec.release ();
5026 while (!visited_vec.is_empty ())
5027 visited.remove (visited_vec.pop ());
5028 }
5029 else
5030 {
5031 i++;
5032 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
5033 {
5034 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
5035 cost_vec.release ();
5036 }
5037 else
5038 /* For BB vectorization remember the SLP graph entry
5039 cost for later. */
5040 instance->cost_vec = cost_vec;
5041 }
5042 }
5043
5044 /* Now look for SLP instances with a root that are covered by other
5045 instances and remove them. */
5046 hash_set<stmt_vec_info> roots;
5047 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5048 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5049 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5050 if (!roots.is_empty ())
5051 {
5052 visited.empty ();
5053 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5054 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5055 visited);
5056 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5057 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5058 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5059 {
5060 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5061 if (dump_enabled_p ())
5062 dump_printf_loc (MSG_NOTE, vect_location,
5063 "removing SLP instance operations starting "
5064 "from: %G", root->stmt);
5065 vect_free_slp_instance (instance);
5066 vinfo->slp_instances.ordered_remove (i);
5067 }
5068 else
5069 ++i;
5070 }
5071
5072 /* Compute vectorizable live stmts. */
5073 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5074 {
5075 hash_set<stmt_vec_info> svisited;
5076 hash_set<slp_tree> visited;
5077 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5078 {
5079 vect_location = instance->location ();
5080 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5081 instance, &instance->cost_vec, svisited,
5082 visited);
5083 }
5084 }
5085
5086 return !vinfo->slp_instances.is_empty ();
5087 }
5088
5089 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
5090 closing the eventual chain. */
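/* Minimal sketch (instance names purely illustrative): with
   INSTANCE_LEADER containing A -> B, B -> C and C -> C, calling
   get_ultimate_leader (A, ...) returns C and rewrites the entries
   for A and B to point directly at C.  */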
5091
5092 static slp_instance
5093 get_ultimate_leader (slp_instance instance,
5094 hash_map<slp_instance, slp_instance> &instance_leader)
5095 {
5096 auto_vec<slp_instance *, 8> chain;
5097 slp_instance *tem;
5098 while (*(tem = instance_leader.get (instance)) != instance)
5099 {
5100 chain.safe_push (tem);
5101 instance = *tem;
5102 }
5103 while (!chain.is_empty ())
5104 *chain.pop () = instance;
5105 return instance;
5106 }
5107
5108 /* Worker of vect_bb_partition_graph, recurse on NODE. */
5109
5110 static void
5111 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5112 slp_instance instance, slp_tree node,
5113 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5114 hash_map<slp_instance, slp_instance> &instance_leader,
5115 hash_set<slp_tree> &visited)
5116 {
5117 stmt_vec_info stmt_info;
5118 unsigned i;
5119
5120 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5121 {
5122 bool existed_p;
5123 slp_instance &stmt_instance
5124 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5125 if (!existed_p)
5126 ;
5127 else if (stmt_instance != instance)
5128 {
5129 /* If we're running into a previously marked stmt make us the
5130 leader of the current ultimate leader. This keeps the
5131 leader chain acyclic and works even when the current instance
5132 connects two previously independent graph parts. */
5133 slp_instance stmt_leader
5134 = get_ultimate_leader (stmt_instance, instance_leader);
5135 if (stmt_leader != instance)
5136 instance_leader.put (stmt_leader, instance);
5137 }
5138 stmt_instance = instance;
5139 }
5140
5141 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5142 return;
5143
5144 slp_tree child;
5145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5146 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5147 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5148 instance_leader, visited);
5149 }
5150
5151 /* Partition the SLP graph into pieces that can be costed independently. */
5152
5153 static void
5154 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5155 {
5156 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5157
5158 /* First walk the SLP graph assigning each involved scalar stmt a
5159 corresponding SLP graph entry and upon visiting a previously
5160 marked stmt, make the stmt's leader the current SLP graph entry. */
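/* For example (illustrative), if instances I1 and I2 both cover scalar
   stmt S and S was first recorded for I1, visiting S again for I2 makes
   I2 the leader of I1's ultimate leader, so both instances end up in
   the same subgraph below.  */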
5161 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5162 hash_map<slp_instance, slp_instance> instance_leader;
5163 hash_set<slp_tree> visited;
5164 slp_instance instance;
5165 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5166 {
5167 instance_leader.put (instance, instance);
5168 vect_bb_partition_graph_r (bb_vinfo,
5169 instance, SLP_INSTANCE_TREE (instance),
5170 stmt_to_instance, instance_leader,
5171 visited);
5172 }
5173
5174 /* Then collect entries to each independent subgraph. */
5175 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5176 {
5177 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5178 leader->subgraph_entries.safe_push (instance);
5179 if (dump_enabled_p ()
5180 && leader != instance)
5181 dump_printf_loc (MSG_NOTE, vect_location,
5182 "instance %p is leader of %p\n",
5183 leader, instance);
5184 }
5185 }
5186
5187 /* Compute the set of scalar stmts participating in internal and external
5188 nodes. */
5189
5190 static void
5191 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5192 hash_set<slp_tree> &visited,
5193 hash_set<stmt_vec_info> &vstmts,
5194 hash_set<stmt_vec_info> &estmts)
5195 {
5196 int i;
5197 stmt_vec_info stmt_info;
5198 slp_tree child;
5199
5200 if (visited.add (node))
5201 return;
5202
5203 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5204 {
5205 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5206 vstmts.add (stmt_info);
5207
5208 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5209 if (child)
5210 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5211 vstmts, estmts);
5212 }
5213 else
5214 for (tree def : SLP_TREE_SCALAR_OPS (node))
5215 {
5216 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5217 if (def_stmt)
5218 estmts.add (def_stmt);
5219 }
5220 }
5221
5222
5223 /* Compute the scalar cost of the SLP node NODE and its children
5224 and return it. Do not account defs that are marked in LIFE and
5225 update LIFE according to uses of NODE. */
5226
5227 static void
5228 vect_bb_slp_scalar_cost (vec_info *vinfo,
5229 slp_tree node, vec<bool, va_heap> *life,
5230 stmt_vector_for_cost *cost_vec,
5231 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5232 hash_set<slp_tree> &visited)
5233 {
5234 unsigned i;
5235 stmt_vec_info stmt_info;
5236 slp_tree child;
5237
5238 if (visited.add (node))
5239 return;
5240
5241 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5242 {
5243 ssa_op_iter op_iter;
5244 def_operand_p def_p;
5245
5246 if ((*life)[i])
5247 continue;
5248
5249 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5250 gimple *orig_stmt = orig_stmt_info->stmt;
5251
5252 /* If there is a non-vectorized use of the defs then the scalar
5253 stmt is kept live in which case we do not account it or any
5254 required defs in the SLP children in the scalar cost. This
5255 way we make the vectorization more costly when compared to
5256 the scalar cost. */
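/* For example (illustrative), if one lane's scalar def is also consumed
   by a stmt outside of the SLP graph then that scalar stmt has to be
   kept and we do not count it as a saving below.  */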
5257 if (!STMT_VINFO_LIVE_P (stmt_info))
5258 {
5259 auto_vec<gimple *, 8> worklist;
5260 hash_set<gimple *> *worklist_visited = NULL;
5261 worklist.quick_push (orig_stmt);
5262 do
5263 {
5264 gimple *work_stmt = worklist.pop ();
5265 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5266 {
5267 imm_use_iterator use_iter;
5268 gimple *use_stmt;
5269 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5270 DEF_FROM_PTR (def_p))
5271 if (!is_gimple_debug (use_stmt))
5272 {
5273 stmt_vec_info use_stmt_info
5274 = vinfo->lookup_stmt (use_stmt);
5275 if (!use_stmt_info
5276 || !vectorized_scalar_stmts.contains (use_stmt_info))
5277 {
5278 if (use_stmt_info
5279 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5280 {
5281 /* For stmts participating in patterns we have
5282 to check its uses recursively. */
5283 if (!worklist_visited)
5284 worklist_visited = new hash_set<gimple *> ();
5285 if (!worklist_visited->add (use_stmt))
5286 worklist.safe_push (use_stmt);
5287 continue;
5288 }
5289 (*life)[i] = true;
5290 goto next_lane;
5291 }
5292 }
5293 }
5294 }
5295 while (!worklist.is_empty ());
5296 next_lane:
5297 if (worklist_visited)
5298 delete worklist_visited;
5299 if ((*life)[i])
5300 continue;
5301 }
5302
5303 /* Count scalar stmts only once. */
5304 if (gimple_visited_p (orig_stmt))
5305 continue;
5306 gimple_set_visited (orig_stmt, true);
5307
5308 vect_cost_for_stmt kind;
5309 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5310 {
5311 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5312 kind = scalar_load;
5313 else
5314 kind = scalar_store;
5315 }
5316 else if (vect_nop_conversion_p (orig_stmt_info))
5317 continue;
5318 /* For single-argument PHIs assume coalescing which means zero cost
5319 for the scalar and the vector PHIs. This avoids artificially
5320 favoring the vector path (but may pessimize it in some cases). */
5321 else if (is_a <gphi *> (orig_stmt_info->stmt)
5322 && gimple_phi_num_args
5323 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5324 continue;
5325 else
5326 kind = scalar_stmt;
5327 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5328 SLP_TREE_VECTYPE (node), 0, vect_body);
5329 }
5330
5331 auto_vec<bool, 20> subtree_life;
5332 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5333 {
5334 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5335 {
5336 /* Do not directly pass LIFE to the recursive call, copy it to
5337 confine changes in the callee to the current child/subtree. */
5338 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5339 {
5340 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5341 for (unsigned j = 0;
5342 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5343 {
5344 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5345 if (perm.first == i)
5346 subtree_life[perm.second] = (*life)[j];
5347 }
5348 }
5349 else
5350 {
5351 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5352 subtree_life.safe_splice (*life);
5353 }
5354 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5355 vectorized_scalar_stmts, visited);
5356 subtree_life.truncate (0);
5357 }
5358 }
5359 }
5360
5361 /* Comparator for the loop-index sorted cost vectors. */
5362
5363 static int
5364 li_cost_vec_cmp (const void *a_, const void *b_)
5365 {
5366 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5367 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5368 if (a->first < b->first)
5369 return -1;
5370 else if (a->first == b->first)
5371 return 0;
5372 return 1;
5373 }
5374
5375 /* Check if vectorization of the basic block is profitable for the
5376 subgraph denoted by SLP_INSTANCES. */
5377
5378 static bool
5379 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5380 vec<slp_instance> slp_instances,
5381 loop_p orig_loop)
5382 {
5383 slp_instance instance;
5384 int i;
5385 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5386 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5387
5388 if (dump_enabled_p ())
5389 {
5390 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5391 hash_set<slp_tree> visited;
5392 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5393 vect_print_slp_graph (MSG_NOTE, vect_location,
5394 SLP_INSTANCE_TREE (instance), visited);
5395 }
5396
5397 /* Compute the set of scalar stmts we know will go away 'locally' when
5398 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5399 not accurate for nodes promoted extern late or for scalar stmts that
5400 are used both in extern defs and in vectorized defs. */
5401 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5402 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5403 hash_set<slp_tree> visited;
5404 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5405 {
5406 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5407 SLP_INSTANCE_TREE (instance),
5408 visited,
5409 vectorized_scalar_stmts,
5410 scalar_stmts_in_externs);
5411 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5412 vectorized_scalar_stmts.add (rstmt);
5413 }
5414 /* Scalar stmts used as defs in external nodes need to be preserved, so
5415 remove them from vectorized_scalar_stmts. */
5416 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5417 vectorized_scalar_stmts.remove (stmt);
5418
5419 /* Calculate scalar cost and sum the cost for the vector stmts
5420 previously collected. */
5421 stmt_vector_for_cost scalar_costs = vNULL;
5422 stmt_vector_for_cost vector_costs = vNULL;
5423 visited.empty ();
5424 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5425 {
5426 auto_vec<bool, 20> life;
5427 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5428 true);
5429 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5430 record_stmt_cost (&scalar_costs,
5431 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5432 scalar_stmt,
5433 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5434 vect_bb_slp_scalar_cost (bb_vinfo,
5435 SLP_INSTANCE_TREE (instance),
5436 &life, &scalar_costs, vectorized_scalar_stmts,
5437 visited);
5438 vector_costs.safe_splice (instance->cost_vec);
5439 instance->cost_vec.release ();
5440 }
5441
5442 if (dump_enabled_p ())
5443 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5444
5445 /* When costing non-loop vectorization we need to consider each covered
5446 loop independently and make sure vectorization is profitable. For
5447 now we assume a loop may not be entered or may execute an arbitrary
5448 number of iterations (??? static information can provide more
5449 precise info here) which means we can simply cost each containing
5450 loop's stmts separately. */
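/* For example (illustrative), a subgraph with stmts both in the
   enclosing block and inside a contained loop is only accepted if the
   part inside the loop is profitable on its own, since that loop may
   iterate an arbitrary number of times.  */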
5451
5452 /* First produce cost vectors sorted by loop index. */
5453 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5454 li_scalar_costs (scalar_costs.length ());
5455 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5456 li_vector_costs (vector_costs.length ());
5457 stmt_info_for_cost *cost;
5458 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5459 {
5460 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5461 li_scalar_costs.quick_push (std::make_pair (l, cost));
5462 }
5463 /* Use an arbitrary used loop as fallback in case the first vector_costs
5464 entry does not have a stmt_info associated with it. */
5465 unsigned l = li_scalar_costs[0].first;
5466 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5467 {
5468 /* We inherit the loop from the previous COST; invariants, externals
5469 and extracts immediately follow the cost for the related stmt. */
5470 if (cost->stmt_info)
5471 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5472 li_vector_costs.quick_push (std::make_pair (l, cost));
5473 }
5474 li_scalar_costs.qsort (li_cost_vec_cmp);
5475 li_vector_costs.qsort (li_cost_vec_cmp);
5476
5477 /* Now cost the portions individually. */
5478 unsigned vi = 0;
5479 unsigned si = 0;
5480 bool profitable = true;
5481 while (si < li_scalar_costs.length ()
5482 && vi < li_vector_costs.length ())
5483 {
5484 unsigned sl = li_scalar_costs[si].first;
5485 unsigned vl = li_vector_costs[vi].first;
5486 if (sl != vl)
5487 {
5488 if (dump_enabled_p ())
5489 dump_printf_loc (MSG_NOTE, vect_location,
5490 "Scalar %d and vector %d loop part do not "
5491 "match up, skipping scalar part\n", sl, vl);
5492 /* Skip the scalar part, assuming zero cost on the vector side. */
5493 do
5494 {
5495 si++;
5496 }
5497 while (si < li_scalar_costs.length ()
5498 && li_scalar_costs[si].first == sl);
5499 continue;
5500 }
5501
5502 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5503 do
5504 {
5505 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5506 si++;
5507 }
5508 while (si < li_scalar_costs.length ()
5509 && li_scalar_costs[si].first == sl);
5510 unsigned dummy;
5511 finish_cost (scalar_target_cost_data, nullptr,
5512 &dummy, &scalar_cost, &dummy);
5513
5514 /* Complete the target-specific vector cost calculation. */
5515 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5516 do
5517 {
5518 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5519 vi++;
5520 }
5521 while (vi < li_vector_costs.length ()
5522 && li_vector_costs[vi].first == vl);
5523 finish_cost (vect_target_cost_data, scalar_target_cost_data,
5524 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5525 delete scalar_target_cost_data;
5526 delete vect_target_cost_data;
5527
5528 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5529
5530 if (dump_enabled_p ())
5531 {
5532 dump_printf_loc (MSG_NOTE, vect_location,
5533 "Cost model analysis for part in loop %d:\n", sl);
5534 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5535 vec_inside_cost + vec_outside_cost);
5536 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5537 }
5538
5539 /* Vectorization is profitable if its cost does not exceed the cost of
5540 the scalar version. Note that we err on the vector side for equal
5541 cost because the cost estimate is otherwise quite pessimistic
5542 (constant uses are free on the scalar side but cost a load on the
5543 vector side for example). */
5544 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5545 {
5546 profitable = false;
5547 break;
5548 }
5549 }
5550 if (profitable && vi < li_vector_costs.length ())
5551 {
5552 if (dump_enabled_p ())
5553 dump_printf_loc (MSG_NOTE, vect_location,
5554 "Excess vector cost for part in loop %d:\n",
5555 li_vector_costs[vi].first);
5556 profitable = false;
5557 }
5558
5559 /* Unset visited flag. This is delayed when the subgraph is profitable
5560 and we process the loop for remaining unvectorized if-converted code. */
5561 if (!orig_loop || !profitable)
5562 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5563 gimple_set_visited (cost->stmt_info->stmt, false);
5564
5565 scalar_costs.release ();
5566 vector_costs.release ();
5567
5568 return profitable;
5569 }
5570
5571 /* qsort comparator for lane defs. */
5572
5573 static int
5574 vld_cmp (const void *a_, const void *b_)
5575 {
5576 auto *a = (const std::pair<unsigned, tree> *)a_;
5577 auto *b = (const std::pair<unsigned, tree> *)b_;
5578 return a->first - b->first;
5579 }
5580
5581 /* Return true if USE_STMT is a vector lane insert into VEC and set
5582 *THIS_LANE to the lane number that is set. */
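/* Such an insert roughly looks like (illustrative GIMPLE, 32-bit
   elements):
     vec_2 = BIT_INSERT_EXPR <vec_1, val_3, 64>;
   where the bit position 64 divided by the element size gives
   *THIS_LANE == 2.  */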
5583
5584 static bool
5585 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5586 {
5587 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5588 if (!use_ass
5589 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5590 || (vec
5591 ? gimple_assign_rhs1 (use_ass) != vec
5592 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5593 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5594 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5595 || !constant_multiple_p
5596 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5597 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5598 this_lane))
5599 return false;
5600 return true;
5601 }
5602
5603 /* Find any vectorizable constructors and add them to the grouped_store
5604 array. */
5605
5606 static void
5607 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5608 {
5609 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5610 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5611 !gsi_end_p (gsi); gsi_next (&gsi))
5612 {
5613 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5614 if (!assign)
5615 continue;
5616
5617 tree rhs = gimple_assign_rhs1 (assign);
5618 enum tree_code code = gimple_assign_rhs_code (assign);
5619 use_operand_p use_p;
5620 gimple *use_stmt;
5621 if (code == CONSTRUCTOR)
5622 {
5623 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5624 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5625 CONSTRUCTOR_NELTS (rhs))
5626 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5627 || uniform_vector_p (rhs))
5628 continue;
5629
5630 unsigned j;
5631 tree val;
5632 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5633 if (TREE_CODE (val) != SSA_NAME
5634 || !bb_vinfo->lookup_def (val))
5635 break;
5636 if (j != CONSTRUCTOR_NELTS (rhs))
5637 continue;
5638
5639 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5640 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5641 }
5642 else if (code == BIT_INSERT_EXPR
5643 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5644 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5645 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5646 && integer_zerop (gimple_assign_rhs3 (assign))
5647 && useless_type_conversion_p
5648 (TREE_TYPE (TREE_TYPE (rhs)),
5649 TREE_TYPE (gimple_assign_rhs2 (assign)))
5650 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5651 {
5652 /* We start to match on insert to lane zero but since the
5653 inserts need not be ordered we'd have to search both
5654 the def and the use chains. */
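/* I.e. we try to match a chain like (illustrative)
     v_1 = BIT_INSERT_EXPR <v_0, a_2, 0>;
     v_3 = BIT_INSERT_EXPR <v_1, b_4, 32>;
     ...
   where each stmt fills exactly one lane, in no particular order.  */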
5655 tree vectype = TREE_TYPE (rhs);
5656 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5657 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5658 auto_sbitmap lanes (nlanes);
5659 bitmap_clear (lanes);
5660 bitmap_set_bit (lanes, 0);
5661 tree def = gimple_assign_lhs (assign);
5662 lane_defs.quick_push
5663 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5664 unsigned lanes_found = 1;
5665 /* Start with the use chains, the last stmt will be the root. */
5666 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5667 vec<stmt_vec_info> roots = vNULL;
5668 roots.safe_push (last);
5669 do
5670 {
5671 use_operand_p use_p;
5672 gimple *use_stmt;
5673 if (!single_imm_use (def, &use_p, &use_stmt))
5674 break;
5675 unsigned this_lane;
5676 if (!bb_vinfo->lookup_stmt (use_stmt)
5677 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5678 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5679 break;
5680 if (bitmap_bit_p (lanes, this_lane))
5681 break;
5682 lanes_found++;
5683 bitmap_set_bit (lanes, this_lane);
5684 gassign *use_ass = as_a <gassign *> (use_stmt);
5685 lane_defs.quick_push (std::make_pair
5686 (this_lane, gimple_assign_rhs2 (use_ass)));
5687 last = bb_vinfo->lookup_stmt (use_ass);
5688 roots.safe_push (last);
5689 def = gimple_assign_lhs (use_ass);
5690 }
5691 while (lanes_found < nlanes);
5692 if (roots.length () > 1)
5693 std::swap(roots[0], roots[roots.length () - 1]);
5694 if (lanes_found < nlanes)
5695 {
5696 /* Now search the def chain. */
5697 def = gimple_assign_rhs1 (assign);
5698 do
5699 {
5700 if (TREE_CODE (def) != SSA_NAME
5701 || !has_single_use (def))
5702 break;
5703 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5704 unsigned this_lane;
5705 if (!bb_vinfo->lookup_stmt (def_stmt)
5706 || !vect_slp_is_lane_insert (def_stmt,
5707 NULL_TREE, &this_lane)
5708 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5709 break;
5710 if (bitmap_bit_p (lanes, this_lane))
5711 break;
5712 lanes_found++;
5713 bitmap_set_bit (lanes, this_lane);
5714 lane_defs.quick_push (std::make_pair
5715 (this_lane,
5716 gimple_assign_rhs2 (def_stmt)));
5717 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5718 def = gimple_assign_rhs1 (def_stmt);
5719 }
5720 while (lanes_found < nlanes);
5721 }
5722 if (lanes_found == nlanes)
5723 {
5724 /* Sort lane_defs after the lane index and register the root. */
5725 lane_defs.qsort (vld_cmp);
5726 vec<stmt_vec_info> stmts;
5727 stmts.create (nlanes);
5728 for (unsigned i = 0; i < nlanes; ++i)
5729 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5730 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5731 stmts, roots));
5732 }
5733 else
5734 roots.release ();
5735 }
5736 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5737 && (associative_tree_code (code) || code == MINUS_EXPR)
5738 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5739 checks pessimize a two-element reduction. PR54400.
5740 ??? In-order reduction could be handled if we only
5741 traverse one operand chain in vect_slp_linearize_chain. */
5742 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5743 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5744 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5745 /* Ops with constants at the tail can be stripped here. */
5746 && TREE_CODE (rhs) == SSA_NAME
5747 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5748 /* Should be the chain end. */
5749 && (!single_imm_use (gimple_assign_lhs (assign),
5750 &use_p, &use_stmt)
5751 || !is_gimple_assign (use_stmt)
5752 || (gimple_assign_rhs_code (use_stmt) != code
5753 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5754 || (gimple_assign_rhs_code (use_stmt)
5755 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5756 {
5757 /* We start the match at the end of a possible association
5758 chain. */
5759 auto_vec<chain_op_t> chain;
5760 auto_vec<std::pair<tree_code, gimple *> > worklist;
5761 auto_vec<gimple *> chain_stmts;
5762 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5763 if (code == MINUS_EXPR)
5764 code = PLUS_EXPR;
5765 internal_fn reduc_fn;
5766 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5767 || reduc_fn == IFN_LAST)
5768 continue;
5769 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5770 /* ??? */
5771 code_stmt, alt_code_stmt, &chain_stmts);
5772 if (chain.length () > 1)
5773 {
5774 /* Sort the chain according to def_type and operation. */
5775 chain.sort (dt_sort_cmp, bb_vinfo);
5776 /* ??? Now we'd want to strip externals and constants
5777 but record those to be handled in the epilogue. */
5778 /* ??? For now do not allow mixing ops or externs/constants. */
5779 bool invalid = false;
5780 for (unsigned i = 0; i < chain.length (); ++i)
5781 if (chain[i].dt != vect_internal_def
5782 || chain[i].code != code)
5783 invalid = true;
5784 if (!invalid)
5785 {
5786 vec<stmt_vec_info> stmts;
5787 stmts.create (chain.length ());
5788 for (unsigned i = 0; i < chain.length (); ++i)
5789 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5790 vec<stmt_vec_info> roots;
5791 roots.create (chain_stmts.length ());
5792 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5793 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5794 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5795 stmts, roots));
5796 }
5797 }
5798 }
5799 }
5800 }
5801
5802 /* Walk the grouped store chains and replace entries with their
5803 pattern variant if any. */
5804
5805 static void
5806 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5807 {
5808 stmt_vec_info first_element;
5809 unsigned i;
5810
5811 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5812 {
5813 /* We also have CTORs in this array. */
5814 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5815 continue;
5816 if (STMT_VINFO_IN_PATTERN_P (first_element))
5817 {
5818 stmt_vec_info orig = first_element;
5819 first_element = STMT_VINFO_RELATED_STMT (first_element);
5820 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5821 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5822 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5823 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5824 vinfo->grouped_stores[i] = first_element;
5825 }
5826 stmt_vec_info prev = first_element;
5827 while (DR_GROUP_NEXT_ELEMENT (prev))
5828 {
5829 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5830 if (STMT_VINFO_IN_PATTERN_P (elt))
5831 {
5832 stmt_vec_info orig = elt;
5833 elt = STMT_VINFO_RELATED_STMT (elt);
5834 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5835 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5836 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5837 }
5838 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5839 prev = elt;
5840 }
5841 }
5842 }
5843
5844 /* Check if the region described by BB_VINFO can be vectorized, returning
5845 true if so. When returning false, set FATAL to true if the same failure
5846 would prevent vectorization at other vector sizes, false if it is still
5847 worth trying other sizes. N_STMTS is the number of statements in the
5848 region. */
5849
5850 static bool
5851 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5852 vec<int> *dataref_groups)
5853 {
5854 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5855
5856 slp_instance instance;
5857 int i;
5858 poly_uint64 min_vf = 2;
5859
5860 /* The first group of checks is independent of the vector size. */
5861 fatal = true;
5862
5863 /* Analyze the data references. */
5864
5865 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5866 {
5867 if (dump_enabled_p ())
5868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5869 "not vectorized: unhandled data-ref in basic "
5870 "block.\n");
5871 return false;
5872 }
5873
5874 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5875 {
5876 if (dump_enabled_p ())
5877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5878 "not vectorized: unhandled data access in "
5879 "basic block.\n");
5880 return false;
5881 }
5882
5883 vect_slp_check_for_constructors (bb_vinfo);
5884
5885 /* If there are no grouped stores and no constructors in the region
5886 there is no need to continue with pattern recog as vect_analyze_slp
5887 will fail anyway. */
5888 if (bb_vinfo->grouped_stores.is_empty ()
5889 && bb_vinfo->roots.is_empty ())
5890 {
5891 if (dump_enabled_p ())
5892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5893 "not vectorized: no grouped stores in "
5894 "basic block.\n");
5895 return false;
5896 }
5897
5898 /* The rest of the analysis below depends on the vector size, so failures are no longer fatal. */
5899 fatal = false;
5900
5901 vect_pattern_recog (bb_vinfo);
5902
5903 /* Update store groups from pattern processing. */
5904 vect_fixup_store_groups_with_patterns (bb_vinfo);
5905
5906 /* Check the SLP opportunities in the basic block, analyze and build SLP
5907 trees. */
5908 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5909 {
5910 if (dump_enabled_p ())
5911 {
5912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5913 "Failed to SLP the basic block.\n");
5914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5915 "not vectorized: failed to find SLP opportunities "
5916 "in basic block.\n");
5917 }
5918 return false;
5919 }
5920
5921 /* Optimize permutations. */
5922 vect_optimize_slp (bb_vinfo);
5923
5924 /* Gather the loads reachable from the SLP graph entries. */
5925 vect_gather_slp_loads (bb_vinfo);
5926
5927 vect_record_base_alignments (bb_vinfo);
5928
5929 /* Analyze and verify the alignment of data references and the
5930 dependence in the SLP instances. */
5931 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5932 {
5933 vect_location = instance->location ();
5934 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5935 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5936 {
5937 slp_tree node = SLP_INSTANCE_TREE (instance);
5938 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5939 if (dump_enabled_p ())
5940 dump_printf_loc (MSG_NOTE, vect_location,
5941 "removing SLP instance operations starting from: %G",
5942 stmt_info->stmt);
5943 vect_free_slp_instance (instance);
5944 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5945 continue;
5946 }
5947
5948 /* Mark all the statements that we want to vectorize as pure SLP and
5949 relevant. */
5950 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5951 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5952 unsigned j;
5953 stmt_vec_info root;
5954 /* Likewise consider instance root stmts as vectorized. */
5955 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5956 STMT_SLP_TYPE (root) = pure_slp;
5957
5958 i++;
5959 }
5960 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5961 return false;
5962
5963 if (!vect_slp_analyze_operations (bb_vinfo))
5964 {
5965 if (dump_enabled_p ())
5966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5967 "not vectorized: bad operation in basic block.\n");
5968 return false;
5969 }
5970
5971 vect_bb_partition_graph (bb_vinfo);
5972
5973 return true;
5974 }
5975
5976 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5977 basic blocks in BBS, returning true on success.
5978 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5979
5980 static bool
5981 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5982 vec<int> *dataref_groups, unsigned int n_stmts,
5983 loop_p orig_loop)
5984 {
5985 bb_vec_info bb_vinfo;
5986 auto_vector_modes vector_modes;
5987
5988 /* Autodetect first vector size we try. */
5989 machine_mode next_vector_mode = VOIDmode;
5990 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5991 unsigned int mode_i = 0;
5992
5993 vec_info_shared shared;
5994
5995 machine_mode autodetected_vector_mode = VOIDmode;
5996 while (1)
5997 {
5998 bool vectorized = false;
5999 bool fatal = false;
6000 bb_vinfo = new _bb_vec_info (bbs, &shared);
6001
6002 bool first_time_p = shared.datarefs.is_empty ();
6003 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
6004 if (first_time_p)
6005 bb_vinfo->shared->save_datarefs ();
6006 else
6007 bb_vinfo->shared->check_datarefs ();
6008 bb_vinfo->vector_mode = next_vector_mode;
6009
6010 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
6011 {
6012 if (dump_enabled_p ())
6013 {
6014 dump_printf_loc (MSG_NOTE, vect_location,
6015 "***** Analysis succeeded with vector mode"
6016 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
6017 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
6018 }
6019
6020 bb_vinfo->shared->check_datarefs ();
6021
6022 auto_vec<slp_instance> profitable_subgraphs;
6023 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
6024 {
6025 if (instance->subgraph_entries.is_empty ())
6026 continue;
6027
6028 vect_location = instance->location ();
6029 if (!unlimited_cost_model (NULL)
6030 && !vect_bb_vectorization_profitable_p
6031 (bb_vinfo, instance->subgraph_entries, orig_loop))
6032 {
6033 if (dump_enabled_p ())
6034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6035 "not vectorized: vectorization is not "
6036 "profitable.\n");
6037 continue;
6038 }
6039
6040 if (!dbg_cnt (vect_slp))
6041 continue;
6042
6043 profitable_subgraphs.safe_push (instance);
6044 }
6045
6046 /* When we're vectorizing an if-converted loop body make sure
6047 we vectorized all if-converted code. */
6048 if (!profitable_subgraphs.is_empty ()
6049 && orig_loop)
6050 {
6051 gcc_assert (bb_vinfo->bbs.length () == 1);
6052 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6053 !gsi_end_p (gsi); gsi_next (&gsi))
6054 {
6055 /* The costing above left us with DCEable vectorized scalar
6056 stmts having the visited flag set on profitable
6057 subgraphs. Do the delayed clearing of the flag here. */
6058 if (gimple_visited_p (gsi_stmt (gsi)))
6059 {
6060 gimple_set_visited (gsi_stmt (gsi), false);
6061 continue;
6062 }
6063 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6064 continue;
6065
6066 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6067 if (gimple_assign_rhs_code (ass) == COND_EXPR)
6068 {
6069 if (!profitable_subgraphs.is_empty ()
6070 && dump_enabled_p ())
6071 dump_printf_loc (MSG_NOTE, vect_location,
6072 "not profitable because of "
6073 "unprofitable if-converted scalar "
6074 "code\n");
6075 profitable_subgraphs.truncate (0);
6076 }
6077 }
6078 }
6079
6080 /* Finally schedule the profitable subgraphs. */
6081 for (slp_instance instance : profitable_subgraphs)
6082 {
6083 if (!vectorized && dump_enabled_p ())
6084 dump_printf_loc (MSG_NOTE, vect_location,
6085 "Basic block will be vectorized "
6086 "using SLP\n");
6087 vectorized = true;
6088
6089 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6090
6091 unsigned HOST_WIDE_INT bytes;
6092 if (dump_enabled_p ())
6093 {
6094 if (GET_MODE_SIZE
6095 (bb_vinfo->vector_mode).is_constant (&bytes))
6096 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6097 "basic block part vectorized using %wu "
6098 "byte vectors\n", bytes);
6099 else
6100 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6101 "basic block part vectorized using "
6102 "variable length vectors\n");
6103 }
6104 }
6105 }
6106 else
6107 {
6108 if (dump_enabled_p ())
6109 dump_printf_loc (MSG_NOTE, vect_location,
6110 "***** Analysis failed with vector mode %s\n",
6111 GET_MODE_NAME (bb_vinfo->vector_mode));
6112 }
6113
6114 if (mode_i == 0)
6115 autodetected_vector_mode = bb_vinfo->vector_mode;
6116
6117 if (!fatal)
6118 while (mode_i < vector_modes.length ()
6119 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6120 {
6121 if (dump_enabled_p ())
6122 dump_printf_loc (MSG_NOTE, vect_location,
6123 "***** The result for vector mode %s would"
6124 " be the same\n",
6125 GET_MODE_NAME (vector_modes[mode_i]));
6126 mode_i += 1;
6127 }
6128
6129 delete bb_vinfo;
6130
6131 if (mode_i < vector_modes.length ()
6132 && VECTOR_MODE_P (autodetected_vector_mode)
6133 && (related_vector_mode (vector_modes[mode_i],
6134 GET_MODE_INNER (autodetected_vector_mode))
6135 == autodetected_vector_mode)
6136 && (related_vector_mode (autodetected_vector_mode,
6137 GET_MODE_INNER (vector_modes[mode_i]))
6138 == vector_modes[mode_i]))
6139 {
6140 if (dump_enabled_p ())
6141 dump_printf_loc (MSG_NOTE, vect_location,
6142 "***** Skipping vector mode %s, which would"
6143 " repeat the analysis for %s\n",
6144 GET_MODE_NAME (vector_modes[mode_i]),
6145 GET_MODE_NAME (autodetected_vector_mode));
6146 mode_i += 1;
6147 }
6148
6149 if (vectorized
6150 || mode_i == vector_modes.length ()
6151 || autodetected_vector_mode == VOIDmode
6152 /* If vect_slp_analyze_bb_1 signaled that analysis for all
6153 vector sizes will fail do not bother iterating. */
6154 || fatal)
6155 return vectorized;
6156
6157 /* Try the next biggest vector size. */
6158 next_vector_mode = vector_modes[mode_i++];
6159 if (dump_enabled_p ())
6160 dump_printf_loc (MSG_NOTE, vect_location,
6161 "***** Re-trying analysis with vector mode %s\n",
6162 GET_MODE_NAME (next_vector_mode));
6163 }
6164 }
6165
6166
6167 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
6168 true if anything in the basic-block was vectorized. */
6169
6170 static bool
6171 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6172 {
6173 vec<data_reference_p> datarefs = vNULL;
6174 auto_vec<int> dataref_groups;
6175 int insns = 0;
6176 int current_group = 0;
6177
6178 for (unsigned i = 0; i < bbs.length (); i++)
6179 {
6180 basic_block bb = bbs[i];
6181 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6182 gsi_next (&gsi))
6183 {
6184 gimple *stmt = gsi_stmt (gsi);
6185 if (is_gimple_debug (stmt))
6186 continue;
6187
6188 insns++;
6189
6190 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6191 vect_location = stmt;
6192
6193 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6194 &dataref_groups, current_group))
6195 ++current_group;
6196 }
6197 /* New BBs always start a new DR group. */
6198 ++current_group;
6199 }
6200
6201 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6202 }
6203
6204 /* Special entry for the BB vectorizer. Analyze and transform a single
6205 if-converted BB with ORIG_LOOPs body being the not if-converted
6206 representation. Returns true if anything in the basic-block was
6207 vectorized. */
6208
6209 bool
6210 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6211 {
6212 auto_vec<basic_block> bbs;
6213 bbs.safe_push (bb);
6214 return vect_slp_bbs (bbs, orig_loop);
6215 }
6216
6217 /* Main entry for the BB vectorizer. Analyze and transform the function
6218 FUN, returning true if anything in it was vectorized. */
6219
6220 bool
6221 vect_slp_function (function *fun)
6222 {
6223 bool r = false;
6224 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6225 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6226
6227 /* For the moment split the function into pieces to avoid making
6228 the iteration on the vector mode moot. Split at points we know
6229 to not handle well which is CFG merges (SLP discovery doesn't
6230 handle non-loop-header PHIs) and loop exits. Since pattern
6231 recog requires reverse iteration to visit uses before defs,
6232 simply chop RPO into pieces. */
6233 auto_vec<basic_block> bbs;
6234 for (unsigned i = 0; i < n; i++)
6235 {
6236 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6237 bool split = false;
6238
6239 /* Split when a BB is not dominated by the first block. */
6240 if (!bbs.is_empty ()
6241 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6242 {
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 "splitting region at dominance boundary bb%d\n",
6246 bb->index);
6247 split = true;
6248 }
6249 /* Split when the loop determined by the first block
6250 is exited. This is because we eventually insert
6251 invariants at region begin. */
6252 else if (!bbs.is_empty ()
6253 && bbs[0]->loop_father != bb->loop_father
6254 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6255 {
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "splitting region at loop %d exit at bb%d\n",
6259 bbs[0]->loop_father->num, bb->index);
6260 split = true;
6261 }
6262
6263 if (split && !bbs.is_empty ())
6264 {
6265 r |= vect_slp_bbs (bbs, NULL);
6266 bbs.truncate (0);
6267 }
6268
6269 /* We need to be able to insert at the head of the region, which
6270 we cannot do for a region starting with a returns-twice call. */
6271 if (bbs.is_empty ())
6272 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
6273 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
6274 {
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "skipping bb%d as start of region as it "
6278 "starts with returns-twice call\n",
6279 bb->index);
6280 continue;
6281 }
6282
6283 bbs.safe_push (bb);
6284
6285 /* When we have a stmt ending this block and defining a
6286 value we have to insert on edges when inserting after it for
6287 a vector containing its definition. Avoid this for now. */
6288 if (gimple *last = last_stmt (bb))
6289 if (gimple_get_lhs (last)
6290 && is_ctrl_altering_stmt (last))
6291 {
6292 if (dump_enabled_p ())
6293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6294 "splitting region at control altering "
6295 "definition %G", last);
6296 r |= vect_slp_bbs (bbs, NULL);
6297 bbs.truncate (0);
6298 }
6299 }
6300
6301 if (!bbs.is_empty ())
6302 r |= vect_slp_bbs (bbs, NULL);
6303
6304 free (rpo);
6305
6306 return r;
6307 }
6308
6309 /* Build a variable-length vector in which the elements in ELTS are repeated
6310 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6311 RESULTS and add any new instructions to SEQ.
6312
6313 The approach we use is:
6314
6315 (1) Find a vector mode VM with integer elements of mode IM.
6316
6317 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6318 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6319 from small vectors to IM.
6320
6321 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6322
6323 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6324 correct byte contents.
6325
6326 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6327
6328 We try to find the largest IM for which this sequence works, in order
6329 to cut down on the number of interleaves. */
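/* A sketch with purely illustrative sizes: for ELTS = { a, b, c, d } of
   32-bit elements and a variable-length vector type, IM might be the
   64-bit integer mode, giving ELTS' = { ab, cd }. Each of those is
   duplicated into a vector of mode VM and a single interleaving
   VEC_PERM_EXPR of the two duplicates then produces the repeating
   pattern a b c d ... when viewed as the required type.  */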
6330
6331 void
6332 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6333 const vec<tree> &elts, unsigned int nresults,
6334 vec<tree> &results)
6335 {
6336 unsigned int nelts = elts.length ();
6337 tree element_type = TREE_TYPE (vector_type);
6338
6339 /* (1) Find a vector mode VM with integer elements of mode IM. */
6340 unsigned int nvectors = 1;
6341 tree new_vector_type;
6342 tree permutes[2];
6343 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6344 &nvectors, &new_vector_type,
6345 permutes))
6346 gcc_unreachable ();
6347
6348 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6349 unsigned int partial_nelts = nelts / nvectors;
6350 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6351
6352 tree_vector_builder partial_elts;
6353 auto_vec<tree, 32> pieces (nvectors * 2);
6354 pieces.quick_grow_cleared (nvectors * 2);
6355 for (unsigned int i = 0; i < nvectors; ++i)
6356 {
6357 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6358 ELTS' has mode IM. */
6359 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6360 for (unsigned int j = 0; j < partial_nelts; ++j)
6361 partial_elts.quick_push (elts[i * partial_nelts + j]);
6362 tree t = gimple_build_vector (seq, &partial_elts);
6363 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6364 TREE_TYPE (new_vector_type), t);
6365
6366 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6367 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6368 }
6369
6370 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6371 correct byte contents.
6372
6373 Conceptually, we need to repeat the following operation log2(nvectors)
6374 times, where hi_start = nvectors / 2:
6375
6376 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6377 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6378
6379 However, if each input repeats every N elements and the VF is
6380 a multiple of N * 2, the HI result is the same as the LO result.
6381 This will be true for the first N1 iterations of the outer loop,
6382 followed by N2 iterations for which both the LO and HI results
6383 are needed. I.e.:
6384
6385 N1 + N2 = log2(nvectors)
6386
6387 Each "N1 iteration" doubles the number of redundant vectors and the
6388 effect of the process as a whole is to have a sequence of nvectors/2**N1
6389 vectors that repeats 2**N1 times. Rather than generate these redundant
6390 vectors, we halve the number of vectors for each N1 iteration. */
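/* For instance (illustrative), with nvectors == 4 the loop below runs
   twice. In the first round each input vector repeats a single IM
   value, so when the vector length is known to be a multiple of two
   the HI permute would only duplicate the LO result; it is skipped and
   new_nvectors drops from 4 to 2.  */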
6391 unsigned int in_start = 0;
6392 unsigned int out_start = nvectors;
6393 unsigned int new_nvectors = nvectors;
6394 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6395 {
6396 unsigned int hi_start = new_nvectors / 2;
6397 unsigned int out_i = 0;
6398 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6399 {
6400 if ((in_i & 1) != 0
6401 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6402 2 * in_repeat))
6403 continue;
6404
6405 tree output = make_ssa_name (new_vector_type);
6406 tree input1 = pieces[in_start + (in_i / 2)];
6407 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6408 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6409 input1, input2,
6410 permutes[in_i & 1]);
6411 gimple_seq_add_stmt (seq, stmt);
6412 pieces[out_start + out_i] = output;
6413 out_i += 1;
6414 }
6415 std::swap (in_start, out_start);
6416 new_nvectors = out_i;
6417 }
6418
6419 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6420 results.reserve (nresults);
6421 for (unsigned int i = 0; i < nresults; ++i)
6422 if (i < new_nvectors)
6423 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6424 pieces[in_start + i]));
6425 else
6426 results.quick_push (results[i - new_nvectors]);
6427 }
6428
6429
6430 /* For constant and loop invariant defs in OP_NODE this function creates
6431 vector defs that will be used in the vectorized stmts and stores them
6432 to SLP_TREE_VEC_DEFS of OP_NODE. */
6433
6434 static void
6435 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6436 {
6437 unsigned HOST_WIDE_INT nunits;
6438 tree vec_cst;
6439 unsigned j, number_of_places_left_in_vector;
6440 tree vector_type;
6441 tree vop;
6442 int group_size = op_node->ops.length ();
6443 unsigned int vec_num, i;
6444 unsigned number_of_copies = 1;
6445 bool constant_p;
6446 gimple_seq ctor_seq = NULL;
6447 auto_vec<tree, 16> permute_results;
6448
6449 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6450 vector_type = SLP_TREE_VECTYPE (op_node);
6451
6452 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6453 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6454 auto_vec<tree> voprnds (number_of_vectors);
6455
6456 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6457 created vectors. It is greater than 1 if unrolling is performed.
6458
6459 For example, we have two scalar operands, s1 and s2 (e.g., group of
6460 strided accesses of size two), while NUNITS is four (i.e., four scalars
6461 of this type can be packed in a vector). The output vector will contain
6462 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6463 will be 2).
6464
6465 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6466 containing the operands.
6467
6468 For example, NUNITS is four as before, and the group size is 8
6469 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6470 {s5, s6, s7, s8}. */
6471
6472 /* When using duplicate_and_interleave, we just need one element for
6473 each scalar statement. */
6474 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6475 nunits = group_size;
6476
6477 number_of_copies = nunits * number_of_vectors / group_size;
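/* E.g. for the first example above nunits == 4, group_size == 2 and one
   vector gives 4 * 1 / 2 == 2 copies; for the second example nunits == 4,
   group_size == 8 and two vectors give 4 * 2 / 8 == 1 copy.  */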
6478
6479 number_of_places_left_in_vector = nunits;
6480 constant_p = true;
6481 tree_vector_builder elts (vector_type, nunits, 1);
6482 elts.quick_grow (nunits);
6483 stmt_vec_info insert_after = NULL;
6484 for (j = 0; j < number_of_copies; j++)
6485 {
6486 tree op;
6487 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6488 {
6489 /* Create 'vect_ = {op0,op1,...,opn}'. */
6490 number_of_places_left_in_vector--;
6491 tree orig_op = op;
6492 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6493 {
6494 if (CONSTANT_CLASS_P (op))
6495 {
6496 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6497 {
6498 /* Can't use VIEW_CONVERT_EXPR for booleans because
6499 of possibly different sizes of scalar value and
6500 vector element. */
6501 if (integer_zerop (op))
6502 op = build_int_cst (TREE_TYPE (vector_type), 0);
6503 else if (integer_onep (op))
6504 op = build_all_ones_cst (TREE_TYPE (vector_type));
6505 else
6506 gcc_unreachable ();
6507 }
6508 else
6509 op = fold_unary (VIEW_CONVERT_EXPR,
6510 TREE_TYPE (vector_type), op);
6511 gcc_assert (op && CONSTANT_CLASS_P (op));
6512 }
6513 else
6514 {
6515 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6516 gimple *init_stmt;
6517 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6518 {
6519 tree true_val
6520 = build_all_ones_cst (TREE_TYPE (vector_type));
6521 tree false_val
6522 = build_zero_cst (TREE_TYPE (vector_type));
6523 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6524 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6525 op, true_val,
6526 false_val);
6527 }
6528 else
6529 {
6530 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6531 op);
6532 init_stmt
6533 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6534 op);
6535 }
6536 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6537 op = new_temp;
6538 }
6539 }
6540 elts[number_of_places_left_in_vector] = op;
6541 if (!CONSTANT_CLASS_P (op))
6542 constant_p = false;
6543 /* For BB vectorization we have to compute an insert location
6544 when a def is inside the analyzed region since we cannot
6545 simply insert at the BB start in this case. */
6546 stmt_vec_info opdef;
6547 if (TREE_CODE (orig_op) == SSA_NAME
6548 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6549 && is_a <bb_vec_info> (vinfo)
6550 && (opdef = vinfo->lookup_def (orig_op)))
6551 {
6552 if (!insert_after)
6553 insert_after = opdef;
6554 else
6555 insert_after = get_later_stmt (insert_after, opdef);
6556 }
6557
6558 if (number_of_places_left_in_vector == 0)
6559 {
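	  /* A gloss on the check below (assuming the usual VECTOR_CST
	     encoding): when all elements gathered so far are constants the
	     NUNITS-element group can be encoded as a repeating VECTOR_CST,
	     so any whole multiple of NUNITS subparts works; with variable
	     elements a CONSTRUCTOR needs one element per lane, so we require
	     an exact match and otherwise fall back to
	     duplicate_and_interleave.  */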
6560 if (constant_p
6561 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6562 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6563 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6564 else
6565 {
6566 if (permute_results.is_empty ())
6567 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6568 elts, number_of_vectors,
6569 permute_results);
6570 vec_cst = permute_results[number_of_vectors - j - 1];
6571 }
6572 if (!gimple_seq_empty_p (ctor_seq))
6573 {
6574 if (insert_after)
6575 {
6576 gimple_stmt_iterator gsi;
6577 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6578 {
6579 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6580 gsi_insert_seq_before (&gsi, ctor_seq,
6581 GSI_CONTINUE_LINKING);
6582 }
6583 else if (!stmt_ends_bb_p (insert_after->stmt))
6584 {
6585 gsi = gsi_for_stmt (insert_after->stmt);
6586 gsi_insert_seq_after (&gsi, ctor_seq,
6587 GSI_CONTINUE_LINKING);
6588 }
6589 else
6590 {
6591 /* When we want to insert after a def where the
6592 defining stmt throws then insert on the fallthru
6593 edge. */
6594 edge e = find_fallthru_edge
6595 (gimple_bb (insert_after->stmt)->succs);
6596 basic_block new_bb
6597 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6598 gcc_assert (!new_bb);
6599 }
6600 }
6601 else
6602 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6603 ctor_seq = NULL;
6604 }
6605 voprnds.quick_push (vec_cst);
6606 insert_after = NULL;
6607 number_of_places_left_in_vector = nunits;
6608 constant_p = true;
6609 elts.new_vector (vector_type, nunits, 1);
6610 elts.quick_grow (nunits);
6611 }
6612 }
6613 }
6614
6615 /* Since the vectors are created in reverse order, we have to reverse them
6616    here so the defs end up in the original order. */
6617 vec_num = voprnds.length ();
6618 for (j = vec_num; j != 0; j--)
6619 {
6620 vop = voprnds[j - 1];
6621 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6622 }
6623
6624 /* In case that VF is greater than the unrolling factor needed for the SLP
6625 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6626 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6627 to replicate the vectors. */
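  /* For example (illustrative), if NUMBER_OF_VECTORS is 4 but only two
     vectors v0 and v1 were built above, the loop below produces
     {v0, v1, v0, v1}.  */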
6628 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6629 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6630 i++)
6631 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6632 }
6633
6634 /* Get the Ith vectorized definition from SLP_NODE. */
6635
6636 tree
6637 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6638 {
6639 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6640 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6641 else
6642 return SLP_TREE_VEC_DEFS (slp_node)[i];
6643 }
6644
6645 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6646
6647 void
6648 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6649 {
6650 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6651 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6652 {
6653 unsigned j;
6654 gimple *vec_def_stmt;
6655 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6656 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6657 }
6658 else
6659 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6660 }
6661
6662 /* Get N vectorized definitions for SLP_NODE. */
6663
6664 void
6665 vect_get_slp_defs (vec_info *,
6666 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6667 {
6668 if (n == -1U)
6669 n = SLP_TREE_CHILDREN (slp_node).length ();
6670
6671 for (unsigned i = 0; i < n; ++i)
6672 {
6673 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6674 vec<tree> vec_defs = vNULL;
6675 vect_get_slp_defs (child, &vec_defs);
6676 vec_oprnds->quick_push (vec_defs);
6677 }
6678 }
6679
6680 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6681 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6682 permute statements for the SLP node NODE. Store the number of vector
6683 permute instructions in *N_PERMS and the number of vector load
6684 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6685 that were not needed. */
6686
6687 bool
6688 vect_transform_slp_perm_load (vec_info *vinfo,
6689 slp_tree node, const vec<tree> &dr_chain,
6690 gimple_stmt_iterator *gsi, poly_uint64 vf,
6691 bool analyze_only, unsigned *n_perms,
6692 unsigned int *n_loads, bool dce_chain)
6693 {
6694 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6695 int vec_index = 0;
6696 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6697 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6698 unsigned int mask_element;
6699 machine_mode mode;
6700
6701 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6702 return false;
6703
6704 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6705
6706 mode = TYPE_MODE (vectype);
6707 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6708
6709 /* Initialize the vect stmts of NODE to properly insert the generated
6710 stmts later. */
6711 if (! analyze_only)
6712 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6713 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6714 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6715
6716 /* Generate permutation masks for every NODE. Number of masks for each NODE
6717 is equal to GROUP_SIZE.
6718 E.g., we have a group of three nodes with three loads from the same
6719 location in each node, and the vector size is 4. I.e., we have an
6720 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6721 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6722 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6723 ...
6724
6725 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6726 The last mask is illegal since we assume two operands for permute
6727 operation, and the mask element values can't be outside that range.
6728 Hence, the last mask must be converted into {2,5,5,5}.
6729 For the first two permutations we need the first and the second input
6730 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6731 we need the second and the third vectors: {b1,c1,a2,b2} and
6732 {c2,a3,b3,c3}. */
6733
6734 int vect_stmts_counter = 0;
6735 unsigned int index = 0;
6736 int first_vec_index = -1;
6737 int second_vec_index = -1;
6738 bool noop_p = true;
6739 *n_perms = 0;
6740
6741 vec_perm_builder mask;
6742 unsigned int nelts_to_build;
6743 unsigned int nvectors_per_build;
6744 unsigned int in_nlanes;
6745 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6746 && multiple_p (nunits, group_size));
6747 if (repeating_p)
6748 {
6749 /* A single vector contains a whole number of copies of the node, so:
6750 (a) all permutes can use the same mask; and
6751 (b) the permutes only need a single vector input. */
6752 mask.new_vector (nunits, group_size, 3);
6753 nelts_to_build = mask.encoded_nelts ();
6754 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6755 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6756 }
6757 else
6758 {
6759 /* We need to construct a separate mask for each vector statement. */
6760 unsigned HOST_WIDE_INT const_nunits, const_vf;
6761 if (!nunits.is_constant (&const_nunits)
6762 || !vf.is_constant (&const_vf))
6763 return false;
6764 mask.new_vector (const_nunits, const_nunits, 1);
6765 nelts_to_build = const_vf * group_size;
6766 nvectors_per_build = 1;
6767 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6768 }
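  /* Illustrative numbers for the two cases above: with GROUP_SIZE ==
     DR_GROUP_SIZE == 2 and NUNITS == 8 each loaded vector already holds
     four copies of the group, so a single stepped mask encoding of
     2 * 3 elements describes every output vector; with GROUP_SIZE ==
     DR_GROUP_SIZE == 3 and NUNITS == 4 the group does not divide the
     vector, so for VF == 4 we build VF * GROUP_SIZE == 12 mask elements
     explicitly, one full mask per NUNITS of them.  */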
6769 auto_sbitmap used_in_lanes (in_nlanes);
6770 bitmap_clear (used_in_lanes);
6771 auto_bitmap used_defs;
6772
6773 unsigned int count = mask.encoded_nelts ();
6774 mask.quick_grow (count);
6775 vec_perm_indices indices;
6776
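  /* Worked example for the loop below (illustrative): DR_GROUP_SIZE == 2
     with the node loading only lane 0 (load permutation {0}, GROUP_SIZE
     == 1), NUNITS == 4 and VF == 4.  Then I takes the values 0, 2, 4, 6,
     which map to vector indexes 0, 0, 1, 1 and mask elements 0, 2, 4, 6,
     i.e. a single two-input permute extracting the even lanes from
     DR_CHAIN[0] and DR_CHAIN[1].  */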
6777 for (unsigned int j = 0; j < nelts_to_build; j++)
6778 {
6779 unsigned int iter_num = j / group_size;
6780 unsigned int stmt_num = j % group_size;
6781 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6782 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6783 bitmap_set_bit (used_in_lanes, i);
6784 if (repeating_p)
6785 {
6786 first_vec_index = 0;
6787 mask_element = i;
6788 }
6789 else
6790 {
6791 /* Enforced before the loop when !repeating_p. */
6792 unsigned int const_nunits = nunits.to_constant ();
6793 vec_index = i / const_nunits;
6794 mask_element = i % const_nunits;
6795 if (vec_index == first_vec_index
6796 || first_vec_index == -1)
6797 {
6798 first_vec_index = vec_index;
6799 }
6800 else if (vec_index == second_vec_index
6801 || second_vec_index == -1)
6802 {
6803 second_vec_index = vec_index;
6804 mask_element += const_nunits;
6805 }
6806 else
6807 {
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "permutation requires at "
6811 "least three vectors %G",
6812 stmt_info->stmt);
6813 gcc_assert (analyze_only);
6814 return false;
6815 }
6816
6817 gcc_assert (mask_element < 2 * const_nunits);
6818 }
6819
6820 if (mask_element != index)
6821 noop_p = false;
6822 mask[index++] = mask_element;
6823
6824 if (index == count && !noop_p)
6825 {
6826 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6827 if (!can_vec_perm_const_p (mode, indices))
6828 {
6829 if (dump_enabled_p ())
6830 {
6831 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6832 vect_location,
6833 "unsupported vect permute { ");
6834 for (i = 0; i < count; ++i)
6835 {
6836 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6837 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6838 }
6839 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6840 }
6841 gcc_assert (analyze_only);
6842 return false;
6843 }
6844
6845 ++*n_perms;
6846 }
6847
6848 if (index == count)
6849 {
6850 if (!analyze_only)
6851 {
6852 tree mask_vec = NULL_TREE;
6853
6854 if (! noop_p)
6855 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6856
6857 if (second_vec_index == -1)
6858 second_vec_index = first_vec_index;
6859
6860 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6861 {
6862 /* Generate the permute statement if necessary. */
6863 tree first_vec = dr_chain[first_vec_index + ri];
6864 tree second_vec = dr_chain[second_vec_index + ri];
6865 gimple *perm_stmt;
6866 if (! noop_p)
6867 {
6868 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6869 tree perm_dest
6870 = vect_create_destination_var (gimple_assign_lhs (stmt),
6871 vectype);
6872 perm_dest = make_ssa_name (perm_dest);
6873 perm_stmt
6874 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6875 first_vec, second_vec,
6876 mask_vec);
6877 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6878 gsi);
6879 if (dce_chain)
6880 {
6881 bitmap_set_bit (used_defs, first_vec_index + ri);
6882 bitmap_set_bit (used_defs, second_vec_index + ri);
6883 }
6884 }
6885 else
6886 {
6887 /* If mask was NULL_TREE generate the requested
6888 identity transform. */
6889 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6890 if (dce_chain)
6891 bitmap_set_bit (used_defs, first_vec_index + ri);
6892 }
6893
6894 /* Store the vector statement in NODE. */
6895 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6896 }
6897 }
6898
6899 index = 0;
6900 first_vec_index = -1;
6901 second_vec_index = -1;
6902 noop_p = true;
6903 }
6904 }
6905
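  /* In the !REPEATING_P case the computation below counts how many of the
     loaded vectors are actually referenced by USED_IN_LANES.  Illustrative:
     with NUNITS == 4 and IN_NLANES == 8, used lanes {0, 1, 5} touch vectors
     0 and 1, so *N_LOADS becomes 2; a vector none of whose lanes are used
     is not counted.  */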
6906 if (n_loads)
6907 {
6908 if (repeating_p)
6909 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6910 else
6911 {
6912 /* Enforced above when !repeating_p. */
6913 unsigned int const_nunits = nunits.to_constant ();
6914 *n_loads = 0;
6915 bool load_seen = false;
6916 for (unsigned i = 0; i < in_nlanes; ++i)
6917 {
6918 if (i % const_nunits == 0)
6919 {
6920 if (load_seen)
6921 *n_loads += 1;
6922 load_seen = false;
6923 }
6924 if (bitmap_bit_p (used_in_lanes, i))
6925 load_seen = true;
6926 }
6927 if (load_seen)
6928 *n_loads += 1;
6929 }
6930 }
6931
6932 if (dce_chain)
6933 for (unsigned i = 0; i < dr_chain.length (); ++i)
6934 if (!bitmap_bit_p (used_defs, i))
6935 {
6936 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6937 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6938 gsi_remove (&rgsi, true);
6939 release_defs (stmt);
6940 }
6941
6942 return true;
6943 }
6944
6945 /* Produce the next vector result for SLP permutation NODE by adding a vector
6946 statement at GSI. If MASK_VEC is nonnull, add:
6947
6948 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6949
6950 otherwise add:
6951
6952 <new SSA name> = FIRST_DEF. */
6953
6954 static void
6955 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6956 slp_tree node, tree first_def, tree second_def,
6957 tree mask_vec)
6958 {
6959 tree vectype = SLP_TREE_VECTYPE (node);
6960
6961 /* ??? We SLP match existing vector element extracts but
6962 allow punning which we need to re-instantiate at uses
6963 but have no good way of explicitly representing. */
6964 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6965 {
6966 gassign *conv_stmt
6967 = gimple_build_assign (make_ssa_name (vectype),
6968 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6969 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6970 first_def = gimple_assign_lhs (conv_stmt);
6971 }
6972 gassign *perm_stmt;
6973 tree perm_dest = make_ssa_name (vectype);
6974 if (mask_vec)
6975 {
6976 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6977 {
6978 gassign *conv_stmt
6979 = gimple_build_assign (make_ssa_name (vectype),
6980 build1 (VIEW_CONVERT_EXPR,
6981 vectype, second_def));
6982 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6983 second_def = gimple_assign_lhs (conv_stmt);
6984 }
6985 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6986 first_def, second_def,
6987 mask_vec);
6988 }
6989 else
6990 /* We need a copy here in case the def was external. */
6991 perm_stmt = gimple_build_assign (perm_dest, first_def);
6992 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6993 /* Store the vector statement in NODE. */
6994 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6995 }
6996
6997 /* Vectorize the SLP permutations in NODE as specified
6998 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6999 child number and lane number.
7000 Interleaving of two two-lane two-child SLP subtrees (not supported):
7001 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
7002 A blend of two four-lane two-child SLP subtrees:
7003 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
7004 Highpart of a four-lane one-child SLP subtree (not supported):
7005 [ { 0, 2 }, { 0, 3 } ]
7006 Currently only a subset of these is supported by the code generation below. */
7007
7008 static bool
7009 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
7010 slp_tree node, stmt_vector_for_cost *cost_vec)
7011 {
7012 tree vectype = SLP_TREE_VECTYPE (node);
7013
7014 /* ??? We currently only support all same vector input and output types
7015 while the SLP IL should really do a concat + select and thus accept
7016 arbitrary mismatches. */
7017 slp_tree child;
7018 unsigned i;
7019 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7020 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
7021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7022 {
7023 if (!vect_maybe_update_slp_op_vectype (child, vectype)
7024 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
7025 {
7026 if (dump_enabled_p ())
7027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7028 "Unsupported lane permutation\n");
7029 return false;
7030 }
7031 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
7032 repeating_p = false;
7033 }
7034
7035 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
7036 gcc_assert (perm.length () == SLP_TREE_LANES (node));
7037 if (dump_enabled_p ())
7038 {
7039 dump_printf_loc (MSG_NOTE, vect_location,
7040 "vectorizing permutation");
7041 for (unsigned i = 0; i < perm.length (); ++i)
7042 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
7043 if (repeating_p)
7044 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
7045 dump_printf (MSG_NOTE, "\n");
7046 }
7047
7048 /* REPEATING_P is true if every output vector is guaranteed to use the
7049 same permute vector. We can handle that case for both variable-length
7050 and constant-length vectors, but we only handle other cases for
7051 constant-length vectors.
7052
7053 Set:
7054
7055 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7056 mask vector that we want to build.
7057
7058 - NCOPIES to the number of copies of PERM that we need in order
7059 to build the necessary permute mask vectors.
7060
7061 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7062 for each permute mask vector. This is only relevant when GSI is
7063 nonnull. */
7064 uint64_t npatterns;
7065 unsigned nelts_per_pattern;
7066 uint64_t ncopies;
7067 unsigned noutputs_per_mask;
7068 if (repeating_p)
7069 {
7070 /* We need a single permute mask vector that has the form:
7071
7072 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7073
7074 In other words, the original n-element permute in PERM is
7075 "unrolled" to fill a full vector. The stepped vector encoding
7076 that we use for permutes requires 3n elements. */
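      /* Illustrative: for a two-lane node with PERM swapping the lanes of
	 a single child, the encoded mask elements are
	 { 1, 0, 3, 2, 5, 4 } (NPATTERNS == 2, NELTS_PER_PATTERN == 3),
	 which the stepped encoding extends to { 1, 0, 3, 2, 5, 4, 7, 6, ... }
	 for any vector length.  */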
7077 npatterns = SLP_TREE_LANES (node);
7078 nelts_per_pattern = ncopies = 3;
7079 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7080 }
7081 else
7082 {
7083 /* Calculate every element of every permute mask vector explicitly,
7084 instead of relying on the pattern described above. */
7085 if (!nunits.is_constant (&npatterns))
7086 return false;
7087 nelts_per_pattern = ncopies = 1;
7088 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
7089 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
7090 return false;
7091 noutputs_per_mask = 1;
7092 }
7093 unsigned olanes = ncopies * SLP_TREE_LANES (node);
7094 gcc_assert (repeating_p || multiple_p (olanes, nunits));
7095
7096 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7097 from the { SLP operand, scalar lane } permutation as recorded in the
7098 SLP node as an intermediate step. This part should already work
7099 with SLP children with an arbitrary number of lanes. */
7100 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
7101 auto_vec<unsigned> active_lane;
7102 vperm.create (olanes);
7103 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
7104 for (unsigned i = 0; i < ncopies; ++i)
7105 {
7106 for (unsigned pi = 0; pi < perm.length (); ++pi)
7107 {
7108 std::pair<unsigned, unsigned> p = perm[pi];
7109 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7110 if (repeating_p)
7111 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7112 else
7113 {
7114 /* We checked above that the vectors are constant-length. */
7115 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7116 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7117 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7118 vperm.quick_push ({{p.first, vi}, vl});
7119 }
7120 }
7121 /* Advance to the next group. */
7122 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7123 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7124 }
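  /* Illustrative: for the four-lane blend from the function comment with
     two V4SI children, REPEATING_P holds and the first copy yields
     vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3]; the two further
     copies repeat this with the lane numbers offset by 4 and 8, and the
     mask built below starts { 0, 5, 2, 7, ... }, alternating between the
     two inputs.  */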
7125
7126 if (dump_enabled_p ())
7127 {
7128 dump_printf_loc (MSG_NOTE, vect_location, "as");
7129 for (unsigned i = 0; i < vperm.length (); ++i)
7130 {
7131 if (i != 0
7132 && (repeating_p
7133 ? multiple_p (i, npatterns)
7134 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7135 dump_printf (MSG_NOTE, ",");
7136 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7137 vperm[i].first.first, vperm[i].first.second,
7138 vperm[i].second);
7139 }
7140 dump_printf (MSG_NOTE, "\n");
7141 }
7142
7143 /* We can only handle two-vector permutes, everything else should
7144 be lowered on the SLP level. The following is closely inspired
7145 by vect_transform_slp_perm_load and is supposed to eventually
7146 replace it.
7147 ??? As intermediate step do code-gen in the SLP tree representation
7148 somehow? */
7149 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7150 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7151 unsigned int index = 0;
7152 poly_uint64 mask_element;
7153 vec_perm_builder mask;
7154 mask.new_vector (nunits, npatterns, nelts_per_pattern);
7155 unsigned int count = mask.encoded_nelts ();
7156 mask.quick_grow (count);
7157 vec_perm_indices indices;
7158 unsigned nperms = 0;
7159 for (unsigned i = 0; i < vperm.length (); ++i)
7160 {
7161 mask_element = vperm[i].second;
7162 if (first_vec.first == -1U
7163 || first_vec == vperm[i].first)
7164 first_vec = vperm[i].first;
7165 else if (second_vec.first == -1U
7166 || second_vec == vperm[i].first)
7167 {
7168 second_vec = vperm[i].first;
7169 mask_element += nunits;
7170 }
7171 else
7172 {
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7175 "permutation requires at "
7176 "least three vectors\n");
7177 gcc_assert (!gsi);
7178 return false;
7179 }
7180
7181 mask[index++] = mask_element;
7182
7183 if (index == count)
7184 {
7185 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7186 bool identity_p = indices.series_p (0, 1, 0, 1);
7187 if (!identity_p
7188 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7189 {
7190 if (dump_enabled_p ())
7191 {
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7193 vect_location,
7194 "unsupported vect permute { ");
7195 for (i = 0; i < count; ++i)
7196 {
7197 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7198 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7199 }
7200 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7201 }
7202 gcc_assert (!gsi);
7203 return false;
7204 }
7205
7206 if (!identity_p)
7207 nperms++;
7208 if (gsi)
7209 {
7210 if (second_vec.first == -1U)
7211 second_vec = first_vec;
7212
7213 slp_tree
7214 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7215 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7216
7217 tree mask_vec = NULL_TREE;
7218 if (!identity_p)
7219 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7220
7221 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7222 {
7223 tree first_def
7224 = vect_get_slp_vect_def (first_node,
7225 first_vec.second + vi);
7226 tree second_def
7227 = vect_get_slp_vect_def (second_node,
7228 second_vec.second + vi);
7229 vect_add_slp_permutation (vinfo, gsi, node, first_def,
7230 second_def, mask_vec);
7231 }
7232 }
7233
7234 index = 0;
7235 first_vec = std::make_pair (-1U, -1U);
7236 second_vec = std::make_pair (-1U, -1U);
7237 }
7238 }
7239
7240 if (!gsi)
7241 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7242
7243 return true;
7244 }
7245
7246 /* Vectorize SLP NODE. */
7247
7248 static void
7249 vect_schedule_slp_node (vec_info *vinfo,
7250 slp_tree node, slp_instance instance)
7251 {
7252 gimple_stmt_iterator si;
7253 int i;
7254 slp_tree child;
7255
7256 /* Vectorize externals and constants. */
7257 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7258 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7259 {
7260 /* ??? vectorizable_shift can end up using a scalar operand which is
7261 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7262 node in this case. */
7263 if (!SLP_TREE_VECTYPE (node))
7264 return;
7265
7266 /* There are two reasons vector defs might already exist. The first
7267 is that we are vectorizing an existing vector def. The second is
7268 when performing BB vectorization shared constant/external nodes
7269 are not split apart during partitioning so during the code-gen
7270 DFS walk we can end up visiting them twice. */
7271 if (! SLP_TREE_VEC_DEFS (node).exists ())
7272 vect_create_constant_vectors (vinfo, node);
7273 return;
7274 }
7275
7276 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
7277
7278 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7279
7280 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7281 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7282
7283 if (dump_enabled_p ())
7284 dump_printf_loc (MSG_NOTE, vect_location,
7285 "------>vectorizing SLP node starting from: %G",
7286 stmt_info->stmt);
7287
7288 if (STMT_VINFO_DATA_REF (stmt_info)
7289 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7290 {
7291 /* Vectorized loads go before the first scalar load to make it
7292 ready early, vectorized stores go before the last scalar
7293 stmt which is where all uses are ready. */
7294 stmt_vec_info last_stmt_info = NULL;
7295 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7296 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7297 else /* DR_IS_WRITE */
7298 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7299 si = gsi_for_stmt (last_stmt_info->stmt);
7300 }
7301 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7302 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7303 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7304 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7305 {
7306 /* For PHI node vectorization we do not use the insertion iterator. */
7307 si = gsi_none ();
7308 }
7309 else
7310 {
7311 /* Emit other stmts after the children's vectorized defs, which is
7312 the earliest position possible. */
7313 gimple *last_stmt = NULL;
7314 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
7315 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7316 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7317 {
7318 /* But avoid scheduling internal defs outside of the loop when
7319 we might have only implicitly tracked loop mask/len defs. */
7320 gimple_stmt_iterator si
7321 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
7322 last_stmt = gsi_stmt (si);
7323 }
7324 bool seen_vector_def = false;
7325 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7326 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7327 {
7328 /* For fold-left reductions we are retaining the scalar
7329 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
7330 set so the representation isn't perfect. Resort to the
7331 last scalar def here. */
7332 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7333 {
7334 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7335 == cycle_phi_info_type);
7336 gphi *phi = as_a <gphi *>
7337 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7338 if (!last_stmt
7339 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7340 last_stmt = phi;
7341 }
7342 /* We emit all vectorized stmts of a child at the same place, so
7343 its last vector stmt is also the last one in the IL.
7344 ??? Unless we have a load permutation applied and that
7345 figures to re-use an earlier generated load. */
7346 unsigned j;
7347 gimple *vstmt;
7348 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7349 if (!last_stmt
7350 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7351 last_stmt = vstmt;
7352 }
7353 else if (!SLP_TREE_VECTYPE (child))
7354 {
7355 /* For externals without a vector type we use the scalar defs unvectorized, so look at all of them. */
7356 unsigned j;
7357 tree def;
7358 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7359 if (TREE_CODE (def) == SSA_NAME
7360 && !SSA_NAME_IS_DEFAULT_DEF (def))
7361 {
7362 gimple *stmt = SSA_NAME_DEF_STMT (def);
7363 if (!last_stmt
7364 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7365 last_stmt = stmt;
7366 }
7367 }
7368 else
7369 {
7370 /* For externals we have to look at all defs since their
7371 insertion place is decided per vector. But beware
7372 of pre-existing vectors where we need to make sure
7373 we do not insert before the region boundary. */
7374 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7375 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7376 seen_vector_def = true;
7377 else
7378 {
7379 unsigned j;
7380 tree vdef;
7381 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7382 if (TREE_CODE (vdef) == SSA_NAME
7383 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7384 {
7385 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7386 if (!last_stmt
7387 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7388 last_stmt = vstmt;
7389 }
7390 }
7391 }
7392 /* This can happen when all children are pre-existing vectors or
7393 constants. */
7394 if (!last_stmt)
7395 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7396 if (!last_stmt)
7397 {
7398 gcc_assert (seen_vector_def);
7399 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7400 }
7401 else if (is_ctrl_altering_stmt (last_stmt))
7402 {
7403 /* We split regions to vectorize at control altering stmts
7404 with a definition so this must be an external which
7405 we can insert at the start of the region. */
7406 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7407 }
7408 else if (is_a <bb_vec_info> (vinfo)
7409 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7410 && gimple_could_trap_p (stmt_info->stmt))
7411 {
7412 /* We've constrained possibly trapping operations to all come
7413 from the same basic-block; even if vectorized defs would allow earlier
7414 scheduling, still force the vectorized stmts into the original block.
7415 This is only necessary for BB vectorization since for loop vect
7416 all operations are in a single BB and scalar stmt based
7417 placement doesn't play well with epilogue vectorization. */
7418 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7419 gimple_bb (stmt_info->stmt),
7420 gimple_bb (last_stmt)));
7421 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7422 }
7423 else if (is_a <gphi *> (last_stmt))
7424 si = gsi_after_labels (gimple_bb (last_stmt));
7425 else
7426 {
7427 si = gsi_for_stmt (last_stmt);
7428 gsi_next (&si);
7429 }
7430 }
7431
7432 /* Handle purely internal nodes. */
7433 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7434 {
7435 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
7436 be shared with different SLP nodes (but usually it's the same
7437 operation apart from the case the stmt is only there for denoting
7438 the actual scalar lane defs ...). So do not call vect_transform_stmt
7439 but open-code it here (partly). */
7440 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7441 gcc_assert (done);
7442 stmt_vec_info slp_stmt_info;
7443 unsigned int i;
7444 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7445 if (STMT_VINFO_LIVE_P (slp_stmt_info))
7446 {
7447 done = vectorizable_live_operation (vinfo,
7448 slp_stmt_info, &si, node,
7449 instance, i, true, NULL);
7450 gcc_assert (done);
7451 }
7452 }
7453 else
7454 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7455 }
7456
7457 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7458 For loop vectorization this is done in vectorizable_call, but for SLP
7459 it needs to be deferred until end of vect_schedule_slp, because multiple
7460 SLP instances may refer to the same scalar stmt. */
7461
7462 static void
7463 vect_remove_slp_scalar_calls (vec_info *vinfo,
7464 slp_tree node, hash_set<slp_tree> &visited)
7465 {
7466 gimple *new_stmt;
7467 gimple_stmt_iterator gsi;
7468 int i;
7469 slp_tree child;
7470 tree lhs;
7471 stmt_vec_info stmt_info;
7472
7473 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7474 return;
7475
7476 if (visited.add (node))
7477 return;
7478
7479 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7480 vect_remove_slp_scalar_calls (vinfo, child, visited);
7481
7482 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7483 {
7484 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7485 if (!stmt || gimple_bb (stmt) == NULL)
7486 continue;
7487 if (is_pattern_stmt_p (stmt_info)
7488 || !PURE_SLP_STMT (stmt_info))
7489 continue;
7490 lhs = gimple_call_lhs (stmt);
7491 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7492 gsi = gsi_for_stmt (stmt);
7493 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7494 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7495 }
7496 }
7497
7498 static void
7499 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7500 {
7501 hash_set<slp_tree> visited;
7502 vect_remove_slp_scalar_calls (vinfo, node, visited);
7503 }
7504
7505 /* Vectorize the instance root. */
7506
7507 void
7508 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7509 {
7510 gassign *rstmt = NULL;
7511
7512 if (instance->kind == slp_inst_kind_ctor)
7513 {
7514 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7515 {
7516 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7517 tree vect_lhs = gimple_get_lhs (child_stmt);
7518 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7519 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7520 TREE_TYPE (vect_lhs)))
7521 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7522 vect_lhs);
7523 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7524 }
7525 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7526 {
7527 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7528 gimple *child_stmt;
7529 int j;
7530 vec<constructor_elt, va_gc> *v;
7531 vec_alloc (v, nelts);
7532
7533 /* A CTOR can handle V16HI composition from VNx8HI so we
7534 do not need to convert vector elements if the types
7535 do not match. */
7536 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7537 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7538 gimple_get_lhs (child_stmt));
7539 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7540 tree rtype
7541 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7542 tree r_constructor = build_constructor (rtype, v);
7543 rstmt = gimple_build_assign (lhs, r_constructor);
7544 }
7545 }
7546 else if (instance->kind == slp_inst_kind_bb_reduc)
7547 {
7548 /* Largely inspired by reduction chain epilogue handling in
7549 vect_create_epilog_for_reduction. */
7550 vec<tree> vec_defs = vNULL;
7551 vect_get_slp_defs (node, &vec_defs);
7552 enum tree_code reduc_code
7553 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7554 /* ??? We actually have to reflect signs somewhere. */
7555 if (reduc_code == MINUS_EXPR)
7556 reduc_code = PLUS_EXPR;
7557 gimple_seq epilogue = NULL;
7558 /* We may end up with more than one vector result, reduce them
7559 to one vector. */
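      /* Illustrative: two V4SI partial results v0 and v1 become
	 v = v0 + v1 via the element-wise REDUC_CODE below, and the final
	 scalar is then .REDUC_PLUS (v) (or the corresponding reduction
	 IFN for other codes).  */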
7560 tree vec_def = vec_defs[0];
7561 for (unsigned i = 1; i < vec_defs.length (); ++i)
7562 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7563 vec_def, vec_defs[i]);
7564 vec_defs.release ();
7565 /* ??? Support other schemes than direct internal fn. */
7566 internal_fn reduc_fn;
7567 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7568 || reduc_fn == IFN_LAST)
7569 gcc_unreachable ();
7570 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7571 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7572
7573 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7574 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7575 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7576 update_stmt (gsi_stmt (rgsi));
7577 return;
7578 }
7579 else
7580 gcc_unreachable ();
7581
7582 gcc_assert (rstmt);
7583
7584 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7585 gsi_replace (&rgsi, rstmt, true);
7586 }
7587
7588 struct slp_scc_info
7589 {
7590 bool on_stack;
7591 int dfs;
7592 int lowlink;
7593 };
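/* The DFS, LOWLINK and ON_STACK fields are the usual bookkeeping of
   Tarjan's SCC algorithm, which the scheduling walk below uses to
   discover strongly connected components (cycles of SLP nodes).  */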
7594
7595 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7596
7597 static void
7598 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7599 hash_map<slp_tree, slp_scc_info> &scc_info,
7600 int &maxdfs, vec<slp_tree> &stack)
7601 {
7602 bool existed_p;
7603 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7604 gcc_assert (!existed_p);
7605 info->dfs = maxdfs;
7606 info->lowlink = maxdfs;
7607 maxdfs++;
7608
7609 /* Leaf. */
7610 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7611 {
7612 info->on_stack = false;
7613 vect_schedule_slp_node (vinfo, node, instance);
7614 return;
7615 }
7616
7617 info->on_stack = true;
7618 stack.safe_push (node);
7619
7620 unsigned i;
7621 slp_tree child;
7622 /* DFS recurse. */
7623 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7624 {
7625 if (!child)
7626 continue;
7627 slp_scc_info *child_info = scc_info.get (child);
7628 if (!child_info)
7629 {
7630 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7631 /* Recursion might have re-allocated the hash map, so re-fetch INFO. */
7632 info = scc_info.get (node);
7633 child_info = scc_info.get (child);
7634 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7635 }
7636 else if (child_info->on_stack)
7637 info->lowlink = MIN (info->lowlink, child_info->dfs);
7638 }
7639 if (info->lowlink != info->dfs)
7640 return;
7641
7642 auto_vec<slp_tree, 4> phis_to_fixup;
7643
7644 /* Singleton. */
7645 if (stack.last () == node)
7646 {
7647 stack.pop ();
7648 info->on_stack = false;
7649 vect_schedule_slp_node (vinfo, node, instance);
7650 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7651 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7652 phis_to_fixup.quick_push (node);
7653 }
7654 else
7655 {
7656 /* SCC. */
7657 int last_idx = stack.length () - 1;
7658 while (stack[last_idx] != node)
7659 last_idx--;
7660 /* We can break the cycle at PHIs which have at least one child
7661 code generated. Then we could re-start the DFS walk until
7662 all nodes in the SCC are covered (we might have new entries
7663 for only back-reachable nodes). But it's simpler to just
7664 iterate and schedule those that are ready. */
7665 unsigned todo = stack.length () - last_idx;
7666 do
7667 {
7668 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7669 {
7670 slp_tree entry = stack[idx];
7671 if (!entry)
7672 continue;
7673 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7674 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7675 bool ready = !phi;
7676 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7677 if (!child)
7678 {
7679 gcc_assert (phi);
7680 ready = true;
7681 break;
7682 }
7683 else if (scc_info.get (child)->on_stack)
7684 {
7685 if (!phi)
7686 {
7687 ready = false;
7688 break;
7689 }
7690 }
7691 else
7692 {
7693 if (phi)
7694 {
7695 ready = true;
7696 break;
7697 }
7698 }
7699 if (ready)
7700 {
7701 vect_schedule_slp_node (vinfo, entry, instance);
7702 scc_info.get (entry)->on_stack = false;
7703 stack[idx] = NULL;
7704 todo--;
7705 if (phi)
7706 phis_to_fixup.safe_push (entry);
7707 }
7708 }
7709 }
7710 while (todo != 0);
7711
7712 /* Pop the SCC. */
7713 stack.truncate (last_idx);
7714 }
7715
7716 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7717 slp_tree phi_node;
7718 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7719 {
7720 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7721 edge_iterator ei;
7722 edge e;
7723 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7724 {
7725 unsigned dest_idx = e->dest_idx;
7726 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7727 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7728 continue;
7729 /* Simply fill all args. */
7730 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7731 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7732 vect_get_slp_vect_def (child, i),
7733 e, gimple_phi_arg_location (phi, dest_idx));
7734 }
7735 }
7736 }
7737
7738 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7739
7740 void
7741 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7742 {
7743 slp_instance instance;
7744 unsigned int i;
7745
7746 hash_map<slp_tree, slp_scc_info> scc_info;
7747 int maxdfs = 0;
7748 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7749 {
7750 slp_tree node = SLP_INSTANCE_TREE (instance);
7751 if (dump_enabled_p ())
7752 {
7753 dump_printf_loc (MSG_NOTE, vect_location,
7754 "Vectorizing SLP tree:\n");
7755 /* ??? Dump all? */
7756 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7757 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7758 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7759 vect_print_slp_graph (MSG_NOTE, vect_location,
7760 SLP_INSTANCE_TREE (instance));
7761 }
7762 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
7763 have a PHI be the node breaking the cycle. */
7764 auto_vec<slp_tree> stack;
7765 if (!scc_info.get (node))
7766 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7767
7768 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7769 vectorize_slp_instance_root_stmt (node, instance);
7770
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "vectorizing stmts using SLP.\n");
7774 }
7775
7776 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7777 {
7778 slp_tree root = SLP_INSTANCE_TREE (instance);
7779 stmt_vec_info store_info;
7780 unsigned int j;
7781
7782 /* Remove scalar call stmts. Do not do this for basic-block
7783 vectorization as not all uses may be vectorized.
7784 ??? Why should this be necessary? DCE should be able to
7785 remove the stmts itself.
7786 ??? For BB vectorization we can as well remove scalar
7787 stmts starting from the SLP tree root if they have no
7788 uses. */
7789 if (is_a <loop_vec_info> (vinfo))
7790 vect_remove_slp_scalar_calls (vinfo, root);
7791
7792 /* Remove the vectorized stores' original scalar stmts. */
7793 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7794 {
7795 if (!STMT_VINFO_DATA_REF (store_info)
7796 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7797 break;
7798
7799 store_info = vect_orig_stmt (store_info);
7800 /* Free the attached stmt_vec_info and remove the stmt. */
7801 vinfo->remove_stmt (store_info);
7802
7803 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7804 to not crash in vect_free_slp_tree later. */
7805 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7806 SLP_TREE_REPRESENTATIVE (root) = NULL;
7807 }
7808 }
7809 }
7810