/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */
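/* For instance (an illustrative sketch, relying on DR_GROUP_GAP recording
   the distance from the previous chain member, with consecutive accesses
   having a gap of 1): for a chain that accesses A[0], A[2] and A[3], the
   gaps stored on the second and third members are 2 and 1, so the places
   computed below are 0, 2 and 3 respectively.  */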

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
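/* Roughly, the loop below first tries to view each group of COUNT scalars
   as one wide integer; whenever no suitable integer mode exists, or the
   target cannot perform the two interleaving permutations, it halves the
   fused element size and doubles NVECTORS, until the fused elements fit a
   supported vector type of the same size as the base vector mode.  */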

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode)))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A nonzero *SWAP
   on entry indicates that a swap is required for cond_expr stmts:
   *SWAP is 1 if STMT is a cond and the operands of its comparison need to
   be swapped; *SWAP is 2 if STMT is a cond and the comparison code needs
   to be inverted.  If this function performs any operand swap, it sets
   *SWAP to a nonzero value.
   If there was a fatal error, return -1; if the error could be corrected
   by swapping operands of this node's parent, return 1; if everything is
   OK, return 0.  */
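/* For example (illustrative): if the first stmt computes a < b ? x : y,
   then a later stmt computing b > a ? x : y matches it with *SWAP == 1
   (swapped comparison operands), while one computing a >= b ? y : x
   matches it with *SWAP == 2 (inverted comparison, so the arms swap).  */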
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first
         stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped:
             row 0 leaves everything in place, row 1 swaps the two
             comparison operands, and row 2 swaps the two arms.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             end in the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For an SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets
                 us a sane SLP graph (still the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
         used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
        if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
            || is_pattern_stmt_p (member_info))
          break;

      if (!member_info)
        {
          for (member_info = first_info; member_info;
               member_info = DR_GROUP_NEXT_ELEMENT (member_info))
            STMT_VINFO_VECTYPE (member_info) = vectype;
          return true;
        }
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
           && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: incompatible vector"
                       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
                       " old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
                       " new vector type: %T\n", vectype);
    }
  return false;
}

/* Try to infer and assign a vector type to all the statements in STMTS.
   Used only for BB vectorization.  */

static bool
vect_update_all_shared_vectypes (vec<stmt_vec_info> stmts)
{
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (stmts[0], &vectype,
                                       &nunits_vectype, stmts.length ()))
    return false;

  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (!vect_update_shared_vectype (stmt_info, vectype))
      return false;

  return true;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */
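/* For example (illustrative): for the addsub pattern { a0 + b0, a1 - b1,
   a2 + b2, a3 - b3 } with four-element vectors, the loop below builds the
   selector { 0, 5, 2, 7 }, i.e. a blend of the all-PLUS vector (elements
   0-3 of the permute input) and the all-MINUS vector (elements 4-7).  */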

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require a data
   permutation, and are of supported types of operation.  Return
   true if they are; otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      vec_info *vinfo = stmt_info->vinfo;
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype, group_size)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
          && !vect_update_shared_vectype (stmt_info, vectype))
        continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (call_stmt
                   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (need_same_oprnds)
            {
              tree other_op1 = (call_stmt
                                ? gimple_call_arg (call_stmt, 1)
                                : gimple_assign_rhs2 (stmt));
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (!vect_two_operations_perm_ok_p (stmts, group_size,
                                          vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G",
                                     first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}
1138
1139 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1140 Note we never remove apart from at destruction time so we do not
1141 need a special value for deleted that differs from empty. */
1142 struct bst_traits
1143 {
1144 typedef vec <stmt_vec_info> value_type;
1145 typedef vec <stmt_vec_info> compare_type;
1146 static inline hashval_t hash (value_type);
1147 static inline bool equal (value_type existing, value_type candidate);
is_emptybst_traits1148 static inline bool is_empty (value_type x) { return !x.exists (); }
is_deletedbst_traits1149 static inline bool is_deleted (value_type x) { return !x.exists (); }
1150 static const bool empty_zero_p = true;
mark_emptybst_traits1151 static inline void mark_empty (value_type &x) { x.release (); }
mark_deletedbst_traits1152 static inline void mark_deleted (value_type &x) { x.release (); }
removebst_traits1153 static inline void remove (value_type &x) { x.release (); }
1154 };
1155 inline hashval_t
hash(value_type x)1156 bst_traits::hash (value_type x)
1157 {
1158 inchash::hash h;
1159 for (unsigned i = 0; i < x.length (); ++i)
1160 h.add_int (gimple_uid (x[i]->stmt));
1161 return h.end ();
1162 }
1163 inline bool
equal(value_type existing,value_type candidate)1164 bst_traits::equal (value_type existing, value_type candidate)
1165 {
1166 if (existing.length () != candidate.length ())
1167 return false;
1168 for (unsigned i = 0; i < existing.length (); ++i)
1169 if (existing[i] != candidate[i])
1170 return false;
1171 return true;
1172 }
1173
1174 typedef hash_map <vec <gimple *>, slp_tree,
1175 simple_hashmap_traits <bst_traits, slp_tree> >
1176 scalar_stmts_to_slp_tree_map_t;
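
/* The map above caches, for each scalar-stmt sequence, the result of
   vect_build_slp_tree: a non-NULL SLP tree on success (shared on re-use,
   the map itself owning one reference via the refcnt++ below) or NULL to
   record a failed build so the failure is not re-attempted.  */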

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the group of scalar stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require a
   data permutation, or are of unsupported types of operation.  Otherwise
   return the SLP tree built for STMTS; on failure, MATCHES indicates
   which stmts of the group failed to match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          /* And compute the load permutation.  Whether it is actually
             a permutation depends on the unrolling factor which is
             decided later.  */
          vec<unsigned> load_permutation;
          int j;
          stmt_vec_info load_info;
          load_permutation.create (group_size);
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              int load_place = vect_get_place_in_interleaving_chain
                  (load_info, first_stmt_info);
              gcc_assert (load_place != -1);
              load_permutation.safe_push (load_place);
            }
          SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* COND_EXPRs get one operand slot too many when the condition
             is an SSA name rather than a comparison.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of a non-unary child built up from
             scalars then just throw that away and build it up this node
             from scalars.  */
          if (is_a <bb_vec_info> (vinfo)
              && SLP_TREE_CHILDREN (child).length () > 1
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild
                  && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern
          && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* After swapping some operands we lost track whether an
             operand has any pattern defs so be conservative here.  */
          if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
            oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of a non-unary child built up from
                 scalars then just throw that away and build it up this node
                 from scalars.  */
              if (is_a <bb_vec_info> (vinfo)
                  && SLP_TREE_CHILDREN (child).length () > 1
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild
                      && (vect_update_all_shared_vectypes
                            (oprnd_info->def_stmts)))
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
                                        grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands "
                                         "from scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

    fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i, j;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
        dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
1716
1717 /* Copy the SLP subtree rooted at NODE. */
1718
1719 static slp_tree
slp_copy_subtree(slp_tree node,hash_map<slp_tree,slp_tree> & map)1720 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1721 {
1722 unsigned i;
1723
1724 bool existed_p;
1725 slp_tree ©_ref = map.get_or_insert (node, &existed_p);
1726 if (existed_p)
1727 return copy_ref;
1728
1729 copy_ref = XNEW (_slp_tree);
1730 slp_tree copy = copy_ref;
1731 memcpy (copy, node, sizeof (_slp_tree));
1732 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1733 {
1734 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1735 stmt_vec_info stmt_info;
1736 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1737 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
1738 }
1739 if (SLP_TREE_SCALAR_OPS (node).exists ())
1740 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1741 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1742 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1743 if (SLP_TREE_CHILDREN (node).exists ())
1744 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1745 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1746 copy->refcnt = 0;
1747
1748 slp_tree child;
1749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1750 {
1751 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1752 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1753 }
1754 return copy;
1755 }
1756
1757 /* Rearrange the statements of NODE according to PERMUTATION. */
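/* Illustrative example of the scatter performed below: with GROUP_SIZE 4,
   PERMUTATION {2, 0, 3, 1} and scalar stmts {s0, s1, s2, s3}, the
   assignment tmp_stmts[permutation[i]] = stmts[i] produces
   {s1, s3, s0, s2}, i.e. each stmt moves to the slot its load order
   dictates, so that no runtime permutation is needed afterwards.  */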
1758
1759 static void
1760 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1761 vec<unsigned> permutation,
1762 hash_set<slp_tree> &visited)
1763 {
1764 unsigned int i;
1765 slp_tree child;
1766
1767 if (visited.add (node))
1768 return;
1769
1770 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1771 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1772
1773 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1774 {
1775 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1776 /* ??? Computation nodes are isomorphic and need no rearrangement.
1777 This is a quick hack to cover those where rearrangement breaks
1778 semantics because only the first stmt is guaranteed to have the
1779 correct operation code due to others being swapped or inverted. */
1780 stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
1781 if (is_gimple_assign (first->stmt)
1782 && gimple_assign_rhs_code (first->stmt) == COND_EXPR)
1783 return;
1784 vec<stmt_vec_info> tmp_stmts;
1785 tmp_stmts.create (group_size);
1786 tmp_stmts.quick_grow (group_size);
1787 stmt_vec_info stmt_info;
1788 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1789 tmp_stmts[permutation[i]] = stmt_info;
1790 SLP_TREE_SCALAR_STMTS (node).release ();
1791 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1792 }
1793 if (SLP_TREE_SCALAR_OPS (node).exists ())
1794 {
1795 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1796 vec<tree> tmp_ops;
1797 tmp_ops.create (group_size);
1798 tmp_ops.quick_grow (group_size);
1799 tree op;
1800 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1801 tmp_ops[permutation[i]] = op;
1802 SLP_TREE_SCALAR_OPS (node).release ();
1803 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1804 }
1805 }
1806
1807
1808 /* Attempt to reorder stmts in a reduction chain so that we don't
1809 require any load permutation. Return true if that was possible,
1810 otherwise return false. */
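/* Illustrative example: if every load node of the instance carries the
   same permutation, say {1, 0, 3, 2}, and that permutation uses each
   index 0..GROUP_SIZE-1 exactly once, the whole instance can be
   rearranged to the load order and the permutation dropped.  Something
   like {0, 0, 2, 2} fails the duplicate check below and {0, 1, 4, 5}
   the range check.  */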
1811
1812 static bool
1813 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1814 {
1815 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1816 unsigned int i, j;
1817 unsigned int lidx;
1818 slp_tree node, load;
1819
1820 /* Compare all the permutation sequences to the first one. We know
1821 that at least one load is permuted. */
1822 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1823 if (!node->load_permutation.exists ())
1824 return false;
1825 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1826 {
1827 if (!load->load_permutation.exists ())
1828 return false;
1829 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1830 if (lidx != node->load_permutation[j])
1831 return false;
1832 }
1833
1834 /* Check that the loads in the first sequence are different and there
1835 are no gaps between them. */
1836 auto_sbitmap load_index (group_size);
1837 bitmap_clear (load_index);
1838 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1839 {
1840 if (lidx >= group_size)
1841 return false;
1842 if (bitmap_bit_p (load_index, lidx))
1843 return false;
1844
1845 bitmap_set_bit (load_index, lidx);
1846 }
1847 for (i = 0; i < group_size; i++)
1848 if (!bitmap_bit_p (load_index, i))
1849 return false;
1850
1851 /* This permutation is valid for reduction. Since the order of the
1852 statements in the nodes is not important unless they are memory
1853 accesses, we can rearrange the statements in all the nodes
1854 according to the order of the loads. */
1855
1856 /* We have to unshare the SLP tree we modify. */
1857 hash_map<slp_tree, slp_tree> map;
1858 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1859 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1860 unshared->refcnt++;
1861 SLP_INSTANCE_TREE (slp_instn) = unshared;
1862 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1863 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1864 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1865
1866 /* Do the actual re-arrangement. */
1867 hash_set<slp_tree> visited;
1868 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1869 node->load_permutation, visited);
1870
1871 /* We are done, no actual permutations need to be generated. */
1872 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1873 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1874 {
1875 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1876 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1877 /* But we have to keep those permutations that are required because
1878 of handling of gaps. */
1879 if (known_eq (unrolling_factor, 1U)
1880 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1881 && DR_GROUP_GAP (first_stmt_info) == 0))
1882 SLP_TREE_LOAD_PERMUTATION (node).release ();
1883 else
1884 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1885 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1886 }
1887
1888 return true;
1889 }
1890
1891 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1892
1893 static void
1894 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1895 hash_set<slp_tree> &visited)
1896 {
1897 if (visited.add (node))
1898 return;
1899
1900 if (SLP_TREE_CHILDREN (node).length () == 0)
1901 {
1902 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1903 return;
1904 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1905 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1906 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1907 SLP_INSTANCE_LOADS (inst).safe_push (node);
1908 }
1909 else
1910 {
1911 unsigned i;
1912 slp_tree child;
1913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1914 vect_gather_slp_loads (inst, child, visited);
1915 }
1916 }
1917
1918 static void
1919 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1920 {
1921 hash_set<slp_tree> visited;
1922 vect_gather_slp_loads (inst, node, visited);
1923 }
1924
1925 /* Check if the required load permutations in the SLP instance
1926 SLP_INSTN are supported. */
1927
1928 static bool
1929 vect_supported_load_permutation_p (slp_instance slp_instn)
1930 {
1931 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1932 unsigned int i, j, k, next;
1933 slp_tree node;
1934
1935 if (dump_enabled_p ())
1936 {
1937 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1938 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1939 if (node->load_permutation.exists ())
1940 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1941 dump_printf (MSG_NOTE, "%d ", next);
1942 else
1943 for (k = 0; k < group_size; ++k)
1944 dump_printf (MSG_NOTE, "%d ", k);
1945 dump_printf (MSG_NOTE, "\n");
1946 }
1947
1948 /* In case of reduction every load permutation is allowed, since the order
1949 of the reduction statements is not important (as opposed to the case of
1950 grouped stores). The only condition we need to check is that all the
1951 load nodes are of the same size and have the same permutation (and then
1952 rearrange all the nodes of the SLP instance according to this
1953 permutation). */
1954
1955 /* Check that all the load nodes are of the same size. */
1956 /* ??? Can't we assert this? */
1957 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1958 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1959 return false;
1960
1961 node = SLP_INSTANCE_TREE (slp_instn);
1962 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1963
1964 /* Reduction (there are no data-refs in the root).
1965 In reduction chain the order of the loads is not important. */
1966 if (!STMT_VINFO_DATA_REF (stmt_info)
1967 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
1968 && !SLP_INSTANCE_ROOT_STMT (slp_instn))
1969 vect_attempt_slp_rearrange_stmts (slp_instn);
1970
1971 /* In basic block vectorization we allow any subchain of an interleaving
1972 chain.
1973 FORNOW: not supported in loop SLP because of realignment complications. */
1974 if (STMT_VINFO_BB_VINFO (stmt_info))
1975 {
1976 /* Check whether the loads in an instance form a subchain and thus
1977 no permutation is necessary. */
1978 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1979 {
1980 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1981 continue;
1982 bool subchain_p = true;
1983 stmt_vec_info next_load_info = NULL;
1984 stmt_vec_info load_info;
1985 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1986 {
1987 if (j != 0
1988 && (next_load_info != load_info
1989 || DR_GROUP_GAP (load_info) != 1))
1990 {
1991 subchain_p = false;
1992 break;
1993 }
1994 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1995 }
1996 if (subchain_p)
1997 SLP_TREE_LOAD_PERMUTATION (node).release ();
1998 else
1999 {
2000 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
2001 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
2002 unsigned HOST_WIDE_INT nunits;
2003 unsigned k, maxk = 0;
2004 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
2005 if (k > maxk)
2006 maxk = k;
2007 /* In BB vectorization we may not actually use a loaded vector
2008 accessing elements in excess of DR_GROUP_SIZE. */
2009 tree vectype = STMT_VINFO_VECTYPE (group_info);
2010 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
2011 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
2012 {
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "BB vectorization with gaps at the end of "
2016 "a load is not supported\n");
2017 return false;
2018 }
2019
2020 /* Verify the permutation can be generated. */
2021 vec<tree> tem;
2022 unsigned n_perms;
2023 if (!vect_transform_slp_perm_load (node, tem, NULL,
2024 1, slp_instn, true, &n_perms))
2025 {
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2028 vect_location,
2029 "unsupported load permutation\n");
2030 return false;
2031 }
2032 }
2033 }
2034 return true;
2035 }
2036
2037 /* For loop vectorization verify we can generate the permutation. Be
2038 conservative about the vectorization factor: there are permutations
2039 that will use three vector inputs only starting from a specific factor
2040 and the vectorization factor is not yet final.
2041 ??? The SLP instance unrolling factor might not be the maximum one. */
2042 unsigned n_perms;
2043 poly_uint64 test_vf
2044 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
2045 LOOP_VINFO_VECT_FACTOR
2046 (STMT_VINFO_LOOP_VINFO (stmt_info)));
2047 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
2048 if (node->load_permutation.exists ()
2049 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
2050 slp_instn, true, &n_perms))
2051 return false;
2052
2053 return true;
2054 }
2055
2056
2057 /* Find the last scalar statement in the SLP tree rooted at NODE. */
2058
2059 stmt_vec_info
2060 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2061 {
2062 stmt_vec_info last = NULL;
2063 stmt_vec_info stmt_vinfo;
2064
2065 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2066 {
2067 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2068 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2069 }
2070
2071 return last;
2072 }
2073
2074 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2075 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2076 (also containing the first GROUP1_SIZE stmts, since stores are
2077 consecutive), the second containing the remainder.
2078 Return the first stmt in the second group. */
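/* Worked example (hypothetical numbers): splitting a group of 7
   consecutive stores with GROUP1_SIZE 4 yields a first group of 4 whose
   DR_GROUP_GAP grows by the 3 stmts of the second group it now skips,
   and a second group of 3 whose DR_GROUP_GAP is the original gap plus
   the 4 stmts of the first group.  */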
2079
2080 static stmt_vec_info
2081 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2082 {
2083 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2084 gcc_assert (group1_size > 0);
2085 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2086 gcc_assert (group2_size > 0);
2087 DR_GROUP_SIZE (first_vinfo) = group1_size;
2088
2089 stmt_vec_info stmt_info = first_vinfo;
2090 for (unsigned i = group1_size; i > 1; i--)
2091 {
2092 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2093 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2094 }
2095 /* STMT is now the last element of the first group. */
2096 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2097 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2098
2099 DR_GROUP_SIZE (group2) = group2_size;
2100 for (stmt_info = group2; stmt_info;
2101 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2102 {
2103 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2104 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2105 }
2106
2107 /* For the second group, the DR_GROUP_GAP is that before the original group,
2108 plus skipping over the whole first group. */
2109 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2110
2111 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2112 DR_GROUP_GAP (first_vinfo) += group2_size;
2113
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2116 group1_size, group2_size);
2117
2118 return group2;
2119 }
2120
2121 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2122 statements and a vector of NUNITS elements. */
2123
2124 static poly_uint64
2125 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2126 {
2127 return exact_div (common_multiple (nunits, group_size), group_size);
2128 }
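/* For instance, NUNITS 8 and GROUP_SIZE 6 give
   common_multiple (8, 6) / 6 = 24 / 6 = 4, i.e. four copies of the
   6-stmt group are needed to fill whole vectors, while GROUP_SIZE 8
   gives 8 / 8 = 1 and no unrolling.  */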
2129
2130 /* Analyze an SLP instance starting from a group of grouped stores. Call
2131 vect_build_slp_tree to build a tree of packed stmts if possible.
2132 Return FALSE if it's impossible to SLP any stmt in the loop. */
2133
2134 static bool
2135 vect_analyze_slp_instance (vec_info *vinfo,
2136 scalar_stmts_to_slp_tree_map_t *bst_map,
2137 stmt_vec_info stmt_info, unsigned max_tree_size)
2138 {
2139 slp_instance new_instance;
2140 slp_tree node;
2141 unsigned int group_size;
2142 tree vectype, scalar_type = NULL_TREE;
2143 unsigned int i;
2144 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2145 vec<stmt_vec_info> scalar_stmts;
2146 bool constructor = false;
2147
2148 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2149 {
2150 scalar_type = TREE_TYPE (DR_REF (dr));
2151 group_size = DR_GROUP_SIZE (stmt_info);
2152 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2153 }
2154 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2155 {
2156 gcc_assert (is_a <loop_vec_info> (vinfo));
2157 vectype = STMT_VINFO_VECTYPE (stmt_info);
2158 group_size = REDUC_GROUP_SIZE (stmt_info);
2159 }
2160 else if (is_gimple_assign (stmt_info->stmt)
2161 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2162 {
2163 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2164 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2165 constructor = true;
2166 }
2167 else
2168 {
2169 gcc_assert (is_a <loop_vec_info> (vinfo));
2170 vectype = STMT_VINFO_VECTYPE (stmt_info);
2171 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2172 }
2173
2174 if (!vectype)
2175 {
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "Build SLP failed: unsupported data-type %T\n",
2179 scalar_type);
2180
2181 return false;
2182 }
2183 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2184
2185 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2186 scalar_stmts.create (group_size);
2187 stmt_vec_info next_info = stmt_info;
2188 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2189 {
2190 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2191 while (next_info)
2192 {
2193 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2194 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2195 }
2196 }
2197 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2198 {
2199 /* Collect the reduction stmts and store them in
2200 SLP_TREE_SCALAR_STMTS. */
2201 while (next_info)
2202 {
2203 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2204 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2205 }
2206 /* Mark the first element of the reduction chain as reduction to properly
2207 transform the node. In the reduction analysis phase only the last
2208 element of the chain is marked as reduction. */
2209 STMT_VINFO_DEF_TYPE (stmt_info)
2210 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2211 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2212 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2213 }
2214 else if (constructor)
2215 {
2216 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2217 tree val;
2218 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2219 {
2220 if (TREE_CODE (val) == SSA_NAME)
2221 {
2222 gimple* def = SSA_NAME_DEF_STMT (val);
2223 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2224 /* Value is defined in another basic block. */
2225 if (!def_info)
2226 return false;
2227 def_info = vect_stmt_to_vectorize (def_info);
2228 scalar_stmts.safe_push (def_info);
2229 }
2230 else
2231 return false;
2232 }
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "Analyzing vectorizable constructor: %G\n",
2236 stmt_info->stmt);
2237 }
2238 else
2239 {
2240 /* Collect reduction statements. */
2241 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2242 for (i = 0; reductions.iterate (i, &next_info); i++)
2243 scalar_stmts.safe_push (next_info);
2244 }
2245
2246 /* Build the tree for the SLP instance. */
2247 bool *matches = XALLOCAVEC (bool, group_size);
2248 unsigned npermutes = 0;
2249 poly_uint64 max_nunits = nunits;
2250 unsigned tree_size = 0;
2251 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2252 &max_nunits, matches, &npermutes,
2253 &tree_size, bst_map);
2254 if (node != NULL)
2255 {
2256 /* Calculate the unrolling factor based on the smallest type. */
2257 poly_uint64 unrolling_factor
2258 = calculate_unrolling_factor (max_nunits, group_size);
2259
2260 if (maybe_ne (unrolling_factor, 1U)
2261 && is_a <bb_vec_info> (vinfo))
2262 {
2263 unsigned HOST_WIDE_INT const_max_nunits;
2264 if (!max_nunits.is_constant (&const_max_nunits)
2265 || const_max_nunits > group_size)
2266 {
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "Build SLP failed: store group "
2270 "size not a multiple of the vector size "
2271 "in basic block SLP\n");
2272 vect_free_slp_tree (node, false);
2273 return false;
2274 }
2275 /* Fatal mismatch. */
2276 matches[group_size / const_max_nunits * const_max_nunits] = false;
2277 vect_free_slp_tree (node, false);
2278 }
2279 else
2280 {
2281 /* Create a new SLP instance. */
2282 new_instance = XNEW (class _slp_instance);
2283 SLP_INSTANCE_TREE (new_instance) = node;
2284 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2285 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2286 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2287 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2288 new_instance->reduc_phis = NULL;
2289
2290 vect_gather_slp_loads (new_instance, node);
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE, vect_location,
2293 "SLP size %u vs. limit %u.\n",
2294 tree_size, max_tree_size);
2295
2296 /* Compute the load permutation. */
2297 slp_tree load_node;
2298 bool loads_permuted = false;
2299 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2300 {
2301 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2302 continue;
2303 unsigned j;
2304 stmt_vec_info load_info;
2305 bool this_load_permuted = false;
2306 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2307 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2308 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2309 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2310 {
2311 this_load_permuted = true;
2312 break;
2313 }
2314 if (!this_load_permuted
2315 /* The load requires permutation when unrolling exposes
2316 a gap either because the group is larger than the SLP
2317 group-size or because there is a gap between the groups. */
2318 && group_size == DR_GROUP_SIZE (first_stmt_info)
2319 && DR_GROUP_GAP (first_stmt_info) == 0)
2320 {
2321 SLP_TREE_LOAD_PERMUTATION (load_node).release ();
2322 continue;
2323 }
2324 loads_permuted = true;
2325 }
2326
2327 if (loads_permuted)
2328 {
2329 if (!vect_supported_load_permutation_p (new_instance))
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "Build SLP failed: unsupported load "
2334 "permutation %G", stmt_info->stmt);
2335 vect_free_slp_instance (new_instance, false);
2336 return false;
2337 }
2338 }
2339
2340 /* If the loads and stores can be handled with load/store-lanes
2341 instructions, do not generate this SLP instance. */
2342 if (is_a <loop_vec_info> (vinfo)
2343 && loads_permuted
2344 && dr && vect_store_lanes_supported (vectype, group_size, false))
2345 {
2346 slp_tree load_node;
2347 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2348 {
2349 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2350 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2351 /* Use SLP for strided accesses (or if we can't load-lanes). */
2352 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2353 || ! vect_load_lanes_supported
2354 (STMT_VINFO_VECTYPE (stmt_vinfo),
2355 DR_GROUP_SIZE (stmt_vinfo), false))
2356 break;
2357 }
2358 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2359 {
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "Built SLP cancelled: can use "
2363 "load/store-lanes\n");
2364 vect_free_slp_instance (new_instance, false);
2365 return false;
2366 }
2367 }
2368
2369 /* If this is a reduction chain with a conversion in front,
2370 amend the SLP tree with a node for that. */
2371 if (!dr
2372 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2373 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2374 {
2375 /* Get at the conversion stmt - we know it's the single use
2376 of the last stmt of the reduction chain. */
2377 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2378 use_operand_p use_p;
2379 gimple *use_stmt;
2380 bool r = single_imm_use (gimple_assign_lhs (tem),
2381 &use_p, &use_stmt);
2382 gcc_assert (r);
2383 next_info = vinfo->lookup_stmt (use_stmt);
2384 next_info = vect_stmt_to_vectorize (next_info);
2385 scalar_stmts = vNULL;
2386 scalar_stmts.create (group_size);
2387 for (unsigned i = 0; i < group_size; ++i)
2388 scalar_stmts.quick_push (next_info);
2389 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2390 SLP_TREE_CHILDREN (conv).quick_push (node);
2391 SLP_INSTANCE_TREE (new_instance) = conv;
2392 /* We also have to fake this conversion stmt as SLP reduction
2393 group so we don't have to mess with too much code
2394 elsewhere. */
2395 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2396 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2397 }
2398
2399 vinfo->slp_instances.safe_push (new_instance);
2400
2401 if (dump_enabled_p ())
2402 {
2403 dump_printf_loc (MSG_NOTE, vect_location,
2404 "Final SLP tree for instance:\n");
2405 vect_print_slp_tree (MSG_NOTE, vect_location,
2406 SLP_INSTANCE_TREE (new_instance));
2407 }
2408
2409 return true;
2410 }
2411 }
2412 else
2413 {
2414 /* Failed to SLP. */
2415 /* Free the allocated memory. */
2416 scalar_stmts.release ();
2417 }
2418
2419 /* For basic block SLP, try to break the group up into multiples of the
2420 vector size. */
2421 unsigned HOST_WIDE_INT const_nunits;
2422 if (is_a <bb_vec_info> (vinfo)
2423 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2424 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2425 && nunits.is_constant (&const_nunits))
2426 {
2427 /* We consider breaking the group only on VF boundaries from the existing
2428 start. */
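/* E.g. with CONST_NUNITS 4 and the first mismatch at i == 6, the split
   below happens at group1_size == (6 & ~3) == 4; stmts 4 and 5 belong
   to a vector that did not fully match, so the code skips ahead to
   i == 8 before analyzing the remainder.  */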
2429 for (i = 0; i < group_size; i++)
2430 if (!matches[i]) break;
2431
2432 if (i >= const_nunits && i < group_size)
2433 {
2434 /* Split into two groups at the first vector boundary before i. */
2435 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2436 unsigned group1_size = i & ~(const_nunits - 1);
2437
2438 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2439 group1_size);
2440 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2441 max_tree_size);
2442 /* If the first non-match was in the middle of a vector,
2443 skip the rest of that vector. */
2444 if (group1_size < i)
2445 {
2446 i = group1_size + const_nunits;
2447 if (i < group_size)
2448 rest = vect_split_slp_store_group (rest, const_nunits);
2449 }
2450 if (i < group_size)
2451 res |= vect_analyze_slp_instance (vinfo, bst_map,
2452 rest, max_tree_size);
2453 return res;
2454 }
2455 /* Even though the first vector did not all match, we might be able to SLP
2456 (some) of the remainder. FORNOW ignore this possibility. */
2457 }
2458
2459 return false;
2460 }
2461
2462
2463 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2464 trees of packed scalar stmts if SLP is possible. */
2465
2466 opt_result
2467 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2468 {
2469 unsigned int i;
2470 stmt_vec_info first_element;
2471
2472 DUMP_VECT_SCOPE ("vect_analyze_slp");
2473
2474 scalar_stmts_to_slp_tree_map_t *bst_map
2475 = new scalar_stmts_to_slp_tree_map_t ();
2476
2477 /* Find SLP sequences starting from groups of grouped stores. */
2478 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2479 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2480
2481 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2482 {
2483 if (loop_vinfo->reduction_chains.length () > 0)
2484 {
2485 /* Find SLP sequences starting from reduction chains. */
2486 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2487 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2488 max_tree_size))
2489 {
2490 /* Dissolve reduction chain group. */
2491 stmt_vec_info vinfo = first_element;
2492 stmt_vec_info last = NULL;
2493 while (vinfo)
2494 {
2495 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2496 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2497 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2498 last = vinfo;
2499 vinfo = next;
2500 }
2501 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2502 /* It can still be vectorized as part of an SLP reduction. */
2503 loop_vinfo->reductions.safe_push (last);
2504 }
2505 }
2506
2507 /* Find SLP sequences starting from groups of reductions. */
2508 if (loop_vinfo->reductions.length () > 1)
2509 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2510 max_tree_size);
2511 }
2512
2513 /* The map keeps a reference on SLP nodes built, release that. */
2514 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2515 it != bst_map->end (); ++it)
2516 if ((*it).second)
2517 vect_free_slp_tree ((*it).second, false);
2518 delete bst_map;
2519
2520 return opt_result::success ();
2521 }
2522
2523
2524 /* For each possible SLP instance decide whether to SLP it and calculate overall
2525 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2526 least one instance. */
2527
2528 bool
2529 vect_make_slp_decision (loop_vec_info loop_vinfo)
2530 {
2531 unsigned int i;
2532 poly_uint64 unrolling_factor = 1;
2533 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2534 slp_instance instance;
2535 int decided_to_slp = 0;
2536
2537 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2538
2539 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2540 {
2541 /* FORNOW: SLP if you can. */
2542 /* All unroll factors have the form:
2543
2544 GET_MODE_SIZE (vinfo->vector_mode) * X
2545
2546 for some rational X, so they must have a common multiple. */
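/* Plausible example: two instances with unrolling factors 2 and 3
   force an overall factor of force_common_multiple (2, 3) == 6.  */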
2547 unrolling_factor
2548 = force_common_multiple (unrolling_factor,
2549 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2550
2551 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2552 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2553 loop-based vectorization. Such stmts will be marked as HYBRID. */
2554 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2555 decided_to_slp++;
2556 }
2557
2558 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2559
2560 if (decided_to_slp && dump_enabled_p ())
2561 {
2562 dump_printf_loc (MSG_NOTE, vect_location,
2563 "Decided to SLP %d instances. Unrolling factor ",
2564 decided_to_slp);
2565 dump_dec (MSG_NOTE, unrolling_factor);
2566 dump_printf (MSG_NOTE, "\n");
2567 }
2568
2569 return (decided_to_slp > 0);
2570 }
2571
2572
2573 /* Private data for vect_detect_hybrid_slp. */
2574 struct vdhs_data
2575 {
2576 loop_vec_info loop_vinfo;
2577 vec<stmt_vec_info> *worklist;
2578 };
2579
2580 /* Walker for walk_gimple_op. */
2581
2582 static tree
2583 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2584 {
2585 walk_stmt_info *wi = (walk_stmt_info *)data;
2586 vdhs_data *dat = (vdhs_data *)wi->info;
2587
2588 if (wi->is_lhs)
2589 return NULL_TREE;
2590
2591 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2592 if (!def_stmt_info)
2593 return NULL_TREE;
2594 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2595 if (PURE_SLP_STMT (def_stmt_info))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2599 def_stmt_info->stmt);
2600 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2601 dat->worklist->safe_push (def_stmt_info);
2602 }
2603
2604 return NULL_TREE;
2605 }
2606
2607 /* Find stmts that must be both vectorized and SLPed. */
2608
2609 void
2610 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2611 {
2612 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2613
2614 /* All stmts participating in SLP are marked pure_slp, all other
2615 stmts are loop_vect.
2616 First collect all loop_vect stmts into a worklist. */
2617 auto_vec<stmt_vec_info> worklist;
2618 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2619 {
2620 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2621 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2622 gsi_next (&gsi))
2623 {
2624 gphi *phi = gsi.phi ();
2625 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2626 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2627 worklist.safe_push (stmt_info);
2628 }
2629 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2630 gsi_next (&gsi))
2631 {
2632 gimple *stmt = gsi_stmt (gsi);
2633 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2634 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2635 {
2636 for (gimple_stmt_iterator gsi2
2637 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2638 !gsi_end_p (gsi2); gsi_next (&gsi2))
2639 {
2640 stmt_vec_info patt_info
2641 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2642 if (!STMT_SLP_TYPE (patt_info)
2643 && STMT_VINFO_RELEVANT (patt_info))
2644 worklist.safe_push (patt_info);
2645 }
2646 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2647 }
2648 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2649 worklist.safe_push (stmt_info);
2650 }
2651 }
2652
2653 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2654 mark any SLP vectorized stmt as hybrid.
2655 ??? We're visiting def stmts N times (once for each non-SLP and
2656 once for each hybrid-SLP use). */
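/* For instance, if a pure-SLP multiplication feeds both an SLP store
   and a loop-vectorized (non-SLP) reduction, the walk reaches the
   multiplication from the reduction stmt, demotes it to hybrid and
   pushes it on the worklist so its own operands get the same
   treatment.  */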
2657 walk_stmt_info wi;
2658 vdhs_data dat;
2659 dat.worklist = &worklist;
2660 dat.loop_vinfo = loop_vinfo;
2661 memset (&wi, 0, sizeof (wi));
2662 wi.info = (void *)&dat;
2663 while (!worklist.is_empty ())
2664 {
2665 stmt_vec_info stmt_info = worklist.pop ();
2666 /* Since SSA operands are not set up for pattern stmts we need
2667 to use walk_gimple_op. */
2668 wi.is_lhs = 0;
2669 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2670 }
2671 }
2672
2673
2674 /* Initialize a bb_vec_info struct for the statements between
2675 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2676
2677 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2678 gimple_stmt_iterator region_end_in,
2679 vec_info_shared *shared)
2680 : vec_info (vec_info::bb, init_cost (NULL), shared),
2681 bb (gsi_bb (region_begin_in)),
2682 region_begin (region_begin_in),
2683 region_end (region_end_in)
2684 {
2685 gimple_stmt_iterator gsi;
2686
2687 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2688 gsi_next (&gsi))
2689 {
2690 gimple *stmt = gsi_stmt (gsi);
2691 gimple_set_uid (stmt, 0);
2692 add_stmt (stmt);
2693 }
2694
2695 bb->aux = this;
2696 }
2697
2698
2699 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2700 stmts in the basic block. */
2701
2702 _bb_vec_info::~_bb_vec_info ()
2703 {
2704 for (gimple_stmt_iterator si = region_begin;
2705 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2706 /* Reset region marker. */
2707 gimple_set_uid (gsi_stmt (si), -1);
2708
2709 bb->aux = NULL;
2710 }
2711
2712 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2713 given that child nodes have already been processed, and that
2714 their def types currently match their SLP node's def type. */
2715
2716 static bool
2717 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2718 slp_instance node_instance,
2719 stmt_vector_for_cost *cost_vec)
2720 {
2721 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2722 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2723
2724 /* Calculate the number of vector statements to be created for the
2725 scalar stmts in this node. For SLP reductions it is equal to the
2726 number of vector statements in the children (which has already been
2727 calculated by the recursive call). Otherwise it is the number of
2728 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2729 VF divided by the number of elements in a vector. */
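/* Sketch of the second case: DR_GROUP_SIZE 4, VF 8 and 16-element
   vectors give vect_get_num_vectors (8 * 4, vectype) == 32 / 16 == 2
   vector statements for the node.  */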
2730 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2731 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2732 {
2733 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2734 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2735 {
2736 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2737 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2738 break;
2739 }
2740 }
2741 else
2742 {
2743 poly_uint64 vf;
2744 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2745 vf = loop_vinfo->vectorization_factor;
2746 else
2747 vf = 1;
2748 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2750 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2751 = vect_get_num_vectors (vf * group_size, vectype);
2752 }
2753
2754 bool dummy;
2755 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2756 }
2757
2758 /* Try to build NODE from scalars, returning true on success.
2759 NODE_INSTANCE is the SLP instance that contains NODE. */
2760
2761 static bool
2762 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2763 slp_instance node_instance)
2764 {
2765 stmt_vec_info stmt_info;
2766 unsigned int i;
2767
2768 if (!is_a <bb_vec_info> (vinfo)
2769 || node == SLP_INSTANCE_TREE (node_instance)
2770 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2771 return false;
2772
2773 if (dump_enabled_p ())
2774 dump_printf_loc (MSG_NOTE, vect_location,
2775 "Building vector operands from scalars instead\n");
2776
2777 /* Don't remove and free the child nodes here, since they could be
2778 referenced by other structures. The analysis and scheduling phases
2779 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2780 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2781 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2782 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2783 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2784 {
2785 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2786 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2787 }
2788 return true;
2789 }
2790
2791 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2792 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2793
2794 Return true if the operations are supported. */
2795
2796 static bool
2797 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2798 slp_instance node_instance,
2799 hash_set<slp_tree> &visited,
2800 hash_set<slp_tree> &lvisited,
2801 stmt_vector_for_cost *cost_vec)
2802 {
2803 int i, j;
2804 slp_tree child;
2805
2806 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2807 return true;
2808
2809 /* If we already analyzed the exact same set of scalar stmts we're done.
2810 We share the generated vector stmts for those.
2811 The SLP graph is acyclic so not caching whether we failed or succeeded
2812 doesn't result in any issue since we throw away the lvisited set
2813 when we fail. */
2814 if (visited.contains (node)
2815 || lvisited.add (node))
2816 return true;
2817
2818 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2819 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2820 visited, lvisited, cost_vec))
2821 return false;
2822
2823 /* ??? We have to catch late the case where the same first scalar stmt
2824 appears in multiple SLP children with different def types, and fail.
2825 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2826 necessarily match it when that is vect_internal_def. */
2827 auto_vec<vect_def_type, 4> dt;
2828 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2829 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2830 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2831 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2832
2833 /* Push SLP node def-type to stmt operands. */
2834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2835 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2836 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2837 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2838 = SLP_TREE_DEF_TYPE (child);
2839
2840 /* Check everything worked out. */
2841 bool res = true;
2842 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2843 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2844 {
2845 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2846 {
2847 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2848 != SLP_TREE_DEF_TYPE (child))
2849 res = false;
2850 }
2851 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2852 != dt[j])
2853 res = false;
2854 }
2855 if (!res && dump_enabled_p ())
2856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2857 "not vectorized: same operand with different "
2858 "def type in stmt.\n");
2859
2860 if (res)
2861 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2862 cost_vec);
2863
2864 /* Restore def-types. */
2865 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2866 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2867 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2868
2869 /* If this node can't be vectorized, try pruning the tree here rather
2870 than felling the whole thing. */
2871 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2872 res = true;
2873
2874 return res;
2875 }
2876
2877
2878 /* Analyze statements in SLP instances of VINFO. Return true if the
2879 operations are supported. */
2880
2881 bool
2882 vect_slp_analyze_operations (vec_info *vinfo)
2883 {
2884 slp_instance instance;
2885 int i;
2886
2887 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2888
2889 hash_set<slp_tree> visited;
2890 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2891 {
2892 hash_set<slp_tree> lvisited;
2893 stmt_vector_for_cost cost_vec;
2894 cost_vec.create (2);
2895 if (!vect_slp_analyze_node_operations (vinfo,
2896 SLP_INSTANCE_TREE (instance),
2897 instance, visited, lvisited,
2898 &cost_vec)
2899 /* Instances with a root stmt require vectorized defs for the
2900 SLP tree root. */
2901 || (SLP_INSTANCE_ROOT_STMT (instance)
2902 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2903 != vect_internal_def)))
2904 {
2905 slp_tree node = SLP_INSTANCE_TREE (instance);
2906 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_NOTE, vect_location,
2909 "removing SLP instance operations starting from: %G",
2910 stmt_info->stmt);
2911 vect_free_slp_instance (instance, false);
2912 vinfo->slp_instances.ordered_remove (i);
2913 cost_vec.release ();
2914 }
2915 else
2916 {
2917 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2918 x != lvisited.end(); ++x)
2919 visited.add (*x);
2920 i++;
2921
2922 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2923 cost_vec.release ();
2924 }
2925 }
2926
2927 return !vinfo->slp_instances.is_empty ();
2928 }
2929
2930
2931 /* Compute the scalar cost of the SLP node NODE and its children
2932 and record it in COST_VEC. Do not account defs that are marked
2933 in LIFE and update LIFE according to uses of NODE. */
2934
2935 static void
2936 vect_bb_slp_scalar_cost (basic_block bb,
2937 slp_tree node, vec<bool, va_heap> *life,
2938 stmt_vector_for_cost *cost_vec,
2939 hash_set<slp_tree> &visited)
2940 {
2941 unsigned i;
2942 stmt_vec_info stmt_info;
2943 slp_tree child;
2944
2945 if (visited.add (node))
2946 return;
2947
2948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2949 {
2950 gimple *stmt = stmt_info->stmt;
2951 vec_info *vinfo = stmt_info->vinfo;
2952 ssa_op_iter op_iter;
2953 def_operand_p def_p;
2954
2955 if ((*life)[i])
2956 continue;
2957
2958 /* If there is a non-vectorized use of the defs then the scalar
2959 stmt is kept live in which case we do not account it or any
2960 required defs in the SLP children in the scalar cost. This
2961 way we make the vectorization more costly when compared to
2962 the scalar cost. */
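/* E.g. a def also used by a stmt that is not PURE_SLP stays live; it
   is then skipped below, which lowers the scalar cost and makes the
   vector-vs-scalar comparison correspondingly harder to win.  */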
2963 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2964 {
2965 imm_use_iterator use_iter;
2966 gimple *use_stmt;
2967 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2968 if (!is_gimple_debug (use_stmt))
2969 {
2970 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2971 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2972 {
2973 (*life)[i] = true;
2974 BREAK_FROM_IMM_USE_STMT (use_iter);
2975 }
2976 }
2977 }
2978 if ((*life)[i])
2979 continue;
2980
2981 /* Count scalar stmts only once. */
2982 if (gimple_visited_p (stmt))
2983 continue;
2984 gimple_set_visited (stmt, true);
2985
2986 vect_cost_for_stmt kind;
2987 if (STMT_VINFO_DATA_REF (stmt_info))
2988 {
2989 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2990 kind = scalar_load;
2991 else
2992 kind = scalar_store;
2993 }
2994 else if (vect_nop_conversion_p (stmt_info))
2995 continue;
2996 else
2997 kind = scalar_stmt;
2998 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2999 }
3000
3001 auto_vec<bool, 20> subtree_life;
3002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3003 {
3004 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3005 {
3006 /* Do not directly pass LIFE to the recursive call, copy it to
3007 confine changes in the callee to the current child/subtree. */
3008 subtree_life.safe_splice (*life);
3009 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
3010 visited);
3011 subtree_life.truncate (0);
3012 }
3013 }
3014 }
3015
3016 /* Check if vectorization of the basic block is profitable. */
3017
3018 static bool
3019 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3020 {
3021 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3022 slp_instance instance;
3023 int i;
3024 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3025 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3026
3027 /* Calculate scalar cost. */
3028 stmt_vector_for_cost scalar_costs;
3029 scalar_costs.create (0);
3030 hash_set<slp_tree> visited;
3031 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3032 {
3033 auto_vec<bool, 20> life;
3034 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3035 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3036 SLP_INSTANCE_TREE (instance),
3037 &life, &scalar_costs, visited);
3038 }
3039 void *target_cost_data = init_cost (NULL);
3040 add_stmt_costs (target_cost_data, &scalar_costs);
3041 scalar_costs.release ();
3042 unsigned dummy;
3043 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3044 destroy_cost_data (target_cost_data);
3045
3046 /* Unset visited flag. */
3047 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3048 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3049 gimple_set_visited (gsi_stmt (gsi), false);
3050
3051 /* Complete the target-specific cost calculation. */
3052 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3053 &vec_inside_cost, &vec_epilogue_cost);
3054
3055 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3056
3057 if (dump_enabled_p ())
3058 {
3059 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3060 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3061 vec_inside_cost);
3062 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3063 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3064 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3065 }
3066
3067 /* Vectorization is profitable if its cost is less than or equal to the
3068 cost of the scalar version. Note that we err on the vector side for
3069 equal cost because the cost estimate is otherwise quite pessimistic
3070 (constant uses are free on the scalar side but cost a load on the
3071 vector side for example). */
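/* Worked numbers (illustrative only): vec_inside_cost 10, prologue 2
   and epilogue 1 give a vector cost of 13; against a scalar cost of 13
   the check below does not trigger and we vectorize, erring on the
   vector side for the tie as described above.  */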
3072 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3073 return false;
3074
3075 return true;
3076 }
3077
3078 /* Find any vectorizable constructors and add them to the grouped_store
3079 array. */
3080
3081 static void
3082 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3083 {
3084 gimple_stmt_iterator gsi;
3085
3086 for (gsi = bb_vinfo->region_begin;
3087 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3088 {
3089 gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
3090 if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
3091 continue;
3092
3093 tree rhs = gimple_assign_rhs1 (stmt);
3094 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3095 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3096 CONSTRUCTOR_NELTS (rhs))
3097 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3098 || uniform_vector_p (rhs))
3099 continue;
3100
3101 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3102 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3103 }
3104 }
3105
3106 /* Check if the region described by BB_VINFO can be vectorized, returning
3107 true if so. When returning false, set FATAL to true if the same failure
3108 would prevent vectorization at other vector sizes, false if it is still
3109 worth trying other sizes. N_STMTS is the number of statements in the
3110 region. */
3111
3112 static bool
3113 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3114 {
3115 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3116
3117 slp_instance instance;
3118 int i;
3119 poly_uint64 min_vf = 2;
3120
3121 /* The first group of checks is independent of the vector size. */
3122 fatal = true;
3123
3124 /* Analyze the data references. */
3125
3126 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3127 {
3128 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "not vectorized: unhandled data-ref in basic "
3131 "block.\n");
3132 return false;
3133 }
3134
3135 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3136 {
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3139 "not vectorized: not enough data-refs in "
3140 "basic block.\n");
3141 return false;
3142 }
3143
3144 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3145 {
3146 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3148 "not vectorized: unhandled data access in "
3149 "basic block.\n");
3150 return false;
3151 }
3152
3153 vect_slp_check_for_constructors (bb_vinfo);
3154
3155 /* If there are no grouped stores in the region there is no need
3156 to continue with pattern recog as vect_analyze_slp will fail
3157 anyway. */
3158 if (bb_vinfo->grouped_stores.is_empty ())
3159 {
3160 if (dump_enabled_p ())
3161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3162 "not vectorized: no grouped stores in "
3163 "basic block.\n");
3164 return false;
3165 }
3166
3167 /* The rest of the analysis below depends on the vector size in some way. */
3168 fatal = false;
3169
3170 vect_pattern_recog (bb_vinfo);
3171
3172 /* Check the SLP opportunities in the basic block, analyze and build SLP
3173 trees. */
3174 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3175 {
3176 if (dump_enabled_p ())
3177 {
3178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3179 "Failed to SLP the basic block.\n");
3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3181 "not vectorized: failed to find SLP opportunities "
3182 "in basic block.\n");
3183 }
3184 return false;
3185 }
3186
3187 vect_record_base_alignments (bb_vinfo);
3188
3189 /* Analyze and verify the alignment of data references and the
3190 dependence in the SLP instances. */
3191 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3192 {
3193 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3194 || ! vect_slp_analyze_instance_dependence (instance))
3195 {
3196 slp_tree node = SLP_INSTANCE_TREE (instance);
3197 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3198 if (dump_enabled_p ())
3199 dump_printf_loc (MSG_NOTE, vect_location,
3200 "removing SLP instance operations starting from: %G",
3201 stmt_info->stmt);
3202 vect_free_slp_instance (instance, false);
3203 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3204 continue;
3205 }
3206
3207 /* Mark all the statements that we want to vectorize as pure SLP and
3208 relevant. */
3209 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3210 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3211 if (SLP_INSTANCE_ROOT_STMT (instance))
3212 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3213
3214 i++;
3215 }
3216 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3217 return false;
3218
3219 if (!vect_slp_analyze_operations (bb_vinfo))
3220 {
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223 "not vectorized: bad operation in basic block.\n");
3224 return false;
3225 }
3226
3227 /* Cost model: check if the vectorization is worthwhile. */
3228 if (!unlimited_cost_model (NULL)
3229 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3230 {
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3233 "not vectorized: vectorization is not "
3234 "profitable.\n");
3235 return false;
3236 }
3237
3238 if (dump_enabled_p ())
3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 "Basic block will be vectorized using SLP\n");
3241 return true;
3242 }
3243
3244 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3245 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3246 on success. The region has N_STMTS statements and has the datarefs
3247 given by DATAREFS. */
3248
3249 static bool
3250 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3251 gimple_stmt_iterator region_end,
3252 vec<data_reference_p> datarefs,
3253 unsigned int n_stmts)
3254 {
3255 bb_vec_info bb_vinfo;
3256 auto_vector_modes vector_modes;
3257
3258 /* Autodetect first vector size we try. */
3259 machine_mode next_vector_mode = VOIDmode;
3260 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3261 unsigned int mode_i = 0;
3262
3263 vec_info_shared shared;
3264
3265 machine_mode autodetected_vector_mode = VOIDmode;
3266 while (1)
3267 {
3268 bool vectorized = false;
3269 bool fatal = false;
3270 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3271
3272 bool first_time_p = shared.datarefs.is_empty ();
3273 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3274 if (first_time_p)
3275 bb_vinfo->shared->save_datarefs ();
3276 else
3277 bb_vinfo->shared->check_datarefs ();
3278 bb_vinfo->vector_mode = next_vector_mode;
3279
3280 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3281 && dbg_cnt (vect_slp))
3282 {
3283 if (dump_enabled_p ())
3284 {
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "***** Analysis succeeded with vector mode"
3287 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3288 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3289 }
3290
3291 bb_vinfo->shared->check_datarefs ();
3292 vect_schedule_slp (bb_vinfo);
3293
3294 unsigned HOST_WIDE_INT bytes;
3295 if (dump_enabled_p ())
3296 {
3297 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3298 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3299 "basic block part vectorized using %wu byte "
3300 "vectors\n", bytes);
3301 else
3302 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3303 "basic block part vectorized using variable "
3304 "length vectors\n");
3305 }
3306
3307 vectorized = true;
3308 }
3309 else
3310 {
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_NOTE, vect_location,
3313 "***** Analysis failed with vector mode %s\n",
3314 GET_MODE_NAME (bb_vinfo->vector_mode));
3315 }
3316
3317 if (mode_i == 0)
3318 autodetected_vector_mode = bb_vinfo->vector_mode;
3319
3320 if (!fatal)
3321 while (mode_i < vector_modes.length ()
3322 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3323 {
3324 if (dump_enabled_p ())
3325 dump_printf_loc (MSG_NOTE, vect_location,
3326 "***** The result for vector mode %s would"
3327 " be the same\n",
3328 GET_MODE_NAME (vector_modes[mode_i]));
3329 mode_i += 1;
3330 }
3331
3332 delete bb_vinfo;
3333
3334 if (mode_i < vector_modes.length ()
3335 && VECTOR_MODE_P (autodetected_vector_mode)
3336 && (related_vector_mode (vector_modes[mode_i],
3337 GET_MODE_INNER (autodetected_vector_mode))
3338 == autodetected_vector_mode)
3339 && (related_vector_mode (autodetected_vector_mode,
3340 GET_MODE_INNER (vector_modes[mode_i]))
3341 == vector_modes[mode_i]))
3342 {
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "***** Skipping vector mode %s, which would"
3346 " repeat the analysis for %s\n",
3347 GET_MODE_NAME (vector_modes[mode_i]),
3348 GET_MODE_NAME (autodetected_vector_mode));
3349 mode_i += 1;
3350 }
3351
3352 if (vectorized
3353 || mode_i == vector_modes.length ()
3354 || autodetected_vector_mode == VOIDmode
3355 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3356 vector sizes will fail, do not bother iterating. */
3357 || fatal)
3358 return vectorized;
3359
3360 /* Try the next vector mode in the list. */
3361 next_vector_mode = vector_modes[mode_i++];
3362 if (dump_enabled_p ())
3363 dump_printf_loc (MSG_NOTE, vect_location,
3364 "***** Re-trying analysis with vector mode %s\n",
3365 GET_MODE_NAME (next_vector_mode));
3366 }
3367 }
3368
3369 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3370 true if anything in the basic-block was vectorized. */
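/* The block is processed as a sequence of maximal regions: statements
   are accumulated (collecting their data references) until
   vect_find_stmt_data_reference fails, the region gathered so far is
   handed to vect_slp_bb_region, and scanning resumes after the
   offending statement.  */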
3371
3372 bool
3373 vect_slp_bb (basic_block bb)
3374 {
3375 gimple_stmt_iterator gsi;
3376 bool any_vectorized = false;
3377
3378 gsi = gsi_start_bb (bb);
3379 while (!gsi_end_p (gsi))
3380 {
3381 gimple_stmt_iterator region_begin = gsi;
3382 vec<data_reference_p> datarefs = vNULL;
3383 int insns = 0;
3384
3385 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3386 {
3387 gimple *stmt = gsi_stmt (gsi);
3388 if (is_gimple_debug (stmt))
3389 continue;
3390 insns++;
3391
3392 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3393 vect_location = stmt;
3394
3395 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3396 break;
3397 }
3398
3399 /* Skip leading unhandled stmts. */
3400 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3401 {
3402 gsi_next (&gsi);
3403 continue;
3404 }
3405
3406 gimple_stmt_iterator region_end = gsi;
3407
3408 if (insns > param_slp_max_insns_in_bb)
3409 {
3410 if (dump_enabled_p ())
3411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3412 "not vectorized: too many instructions in "
3413 "basic block.\n");
3414 }
3415 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3416 any_vectorized = true;
3417
3418 if (gsi_end_p (region_end))
3419 break;
3420
3421 /* Skip the unhandled stmt. */
3422 gsi_next (&gsi);
3423 }
3424
3425 return any_vectorized;
3426 }
3427
3428
3429 /* Return true if constant operand OP_NUM of STMT_VINFO needs a boolean vector type. */
3430
3431 static bool
3432 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, unsigned op_num)
3433 {
3434 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3435 tree op, vectype;
3436 enum vect_def_type dt;
3437
3438 /* For comparisons and COND_EXPRs the vector type is chosen based
3439 on the other, non-constant comparison operand. */
3440 if (TREE_CODE_CLASS (code) == tcc_comparison)
3441 {
3442 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3443 op = gimple_assign_rhs1 (stmt);
3444
3445 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3446 gcc_unreachable ();
3447
3448 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3449 }
3450
3451 if (code == COND_EXPR)
3452 {
3453 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3454 tree cond = gimple_assign_rhs1 (stmt);
3455
3456 if (TREE_CODE (cond) == SSA_NAME)
3457 {
3458 if (op_num > 0)
3459 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3460 op = cond;
3461 }
3462 else
3463 {
3464 if (op_num > 1)
3465 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3466 op = TREE_OPERAND (cond, 0);
3467 }
3468
3469 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3470 gcc_unreachable ();
3471
3472 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3473 }
3474
3475 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3476 }
3477
3478 /* Build a variable-length vector in which the elements in ELTS are repeated
3479 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3480 RESULTS and add any new instructions to SEQ.
3481
3482 The approach we use is:
3483
3484 (1) Find a vector mode VM with integer elements of mode IM.
3485
3486 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3487 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3488 from small vectors to IM.
3489
3490 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3491
3492 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3493 correct byte contents.
3494
3495 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3496
3497 We try to find the largest IM for which this sequence works, in order
3498 to cut down on the number of interleaves. */
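/* As a concrete illustration (assuming, hypothetically, that the
   target chooses IM = DImode for 32-bit elements), ELTS = { a, b, c, d }
   is regrouped into NELTS' = 2 double-words ab and cd via
   VIEW_CONVERT_EXPR, step (3) broadcasts them to { ab, ab, ... } and
   { cd, cd, ... }, and one interleaving VEC_PERM_EXPR in step (4)
   yields { ab, cd, ab, cd, ... }, i.e. the byte pattern
   { a, b, c, d, a, b, c, d, ... } that step (5) view-converts to
   VECTOR_TYPE.  */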
3499
3500 void
3501 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3502 vec<tree> elts, unsigned int nresults,
3503 vec<tree> &results)
3504 {
3505 unsigned int nelts = elts.length ();
3506 tree element_type = TREE_TYPE (vector_type);
3507
3508 /* (1) Find a vector mode VM with integer elements of mode IM. */
3509 unsigned int nvectors = 1;
3510 tree new_vector_type;
3511 tree permutes[2];
3512 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3513 &nvectors, &new_vector_type,
3514 permutes))
3515 gcc_unreachable ();
3516
3517 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3518 unsigned int partial_nelts = nelts / nvectors;
3519 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3520
3521 tree_vector_builder partial_elts;
3522 auto_vec<tree, 32> pieces (nvectors * 2);
3523 pieces.quick_grow_cleared (nvectors * 2);
3524 for (unsigned int i = 0; i < nvectors; ++i)
3525 {
3526 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3527 ELTS' has mode IM. */
3528 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3529 for (unsigned int j = 0; j < partial_nelts; ++j)
3530 partial_elts.quick_push (elts[i * partial_nelts + j]);
3531 tree t = gimple_build_vector (seq, &partial_elts);
3532 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3533 TREE_TYPE (new_vector_type), t);
3534
3535 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3536 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3537 }
3538
3539 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3540 correct byte contents.
3541
3542 Conceptually, we need to repeat the following operation log2(nvectors)
3543 times, where hi_start = nvectors / 2:
3544
3545 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3546 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3547
3548 However, if each input repeats every N elements and the VF is
3549 a multiple of N * 2, the HI result is the same as the LO result.
3550 This will be true for the first N1 iterations of the outer loop,
3551 followed by N2 iterations for which both the LO and HI results
3552 are needed. I.e.:
3553
3554 N1 + N2 = log2(nvectors)
3555
3556 Each "N1 iteration" doubles the number of redundant vectors and the
3557 effect of the process as a whole is to have a sequence of nvectors/2**N1
3558 vectors that repeats 2**N1 times. Rather than generate these redundant
3559 vectors, we halve the number of vectors for each N1 iteration. */
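  /* Illustrative example (hypothetical sizes): with NVECTORS = 4 and
     NEW_VECTOR_TYPE holding 8 elements, both outer iterations find the
     HI result redundant (8 is a multiple of 2 and of 4), so N1 = 2 and
     N2 = 0: the first pass keeps 2 of the 4 interleaved outputs, the
     second keeps 1, leaving a single vector that repeats 4 times.  */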
3560 unsigned int in_start = 0;
3561 unsigned int out_start = nvectors;
3562 unsigned int new_nvectors = nvectors;
3563 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3564 {
3565 unsigned int hi_start = new_nvectors / 2;
3566 unsigned int out_i = 0;
3567 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
3568 {
3569 if ((in_i & 1) != 0
3570 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3571 2 * in_repeat))
3572 continue;
3573
3574 tree output = make_ssa_name (new_vector_type);
3575 tree input1 = pieces[in_start + (in_i / 2)];
3576 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
3577 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3578 input1, input2,
3579 permutes[in_i & 1]);
3580 gimple_seq_add_stmt (seq, stmt);
3581 pieces[out_start + out_i] = output;
3582 out_i += 1;
3583 }
3584 std::swap (in_start, out_start);
3585 new_nvectors = out_i;
3586 }
3587
3588 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3589 results.reserve (nresults);
3590 for (unsigned int i = 0; i < nresults; ++i)
3591 if (i < new_nvectors)
3592 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3593 pieces[in_start + i]));
3594 else
3595 results.quick_push (results[i - new_nvectors]);
3596 }
3597
3598
3599 /* For constant and loop invariant defs of SLP_NODE this function returns
3600 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3601 OP_NODE determines the node for the operand containing the scalar
3602 operands. */
3603
3604 static void
3605 vect_get_constant_vectors (slp_tree slp_node, unsigned op_num,
3606 vec<tree> *vec_oprnds)
3607 {
3608 slp_tree op_node = SLP_TREE_CHILDREN (slp_node)[op_num];
3609 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3610 vec_info *vinfo = stmt_vinfo->vinfo;
3611 unsigned HOST_WIDE_INT nunits;
3612 tree vec_cst;
3613 unsigned j, number_of_places_left_in_vector;
3614 tree vector_type;
3615 tree vop;
3616 int group_size = op_node->ops.length ();
3617 unsigned int vec_num, i;
3618 unsigned number_of_copies = 1;
3619 bool constant_p;
3620 tree neutral_op = NULL;
3621 gimple_seq ctor_seq = NULL;
3622 auto_vec<tree, 16> permute_results;
3623
3624 /* ??? SLP analysis should compute the vector type for the
3625 constant / invariant and store it in the SLP node. */
3626 tree op = op_node->ops[0];
3627 /* Check if vector type is a boolean vector. */
3628 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3629 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3630 && vect_mask_constant_operand_p (stmt_vinfo, op_num))
3631 vector_type = truth_type_for (stmt_vectype);
3632 else
3633 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node);
3634
3635 /* ??? For lane-reducing ops we should also have the required number
3636 of vector stmts initialized rather than second-guessing here. */
3637 unsigned int number_of_vectors;
3638 if (is_gimple_assign (stmt_vinfo->stmt)
3639 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3640 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3641 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3642 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3643 else
3644 number_of_vectors
3645 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3646 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3647 vector_type);
3648 vec_oprnds->create (number_of_vectors);
3649 auto_vec<tree> voprnds (number_of_vectors);
3650
3651 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3652 created vectors. It is greater than 1 if unrolling is performed.
3653
3654 For example, we have two scalar operands, s1 and s2 (e.g., group of
3655 strided accesses of size two), while NUNITS is four (i.e., four scalars
3656 of this type can be packed in a vector). The output vector will contain
3657 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3658 will be 2).
3659
3660 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3661 containing the operands.
3662
3663 For example, NUNITS is four as before, and the group size is 8
3664 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3665 {s5, s6, s7, s8}. */
3666
3667 /* When using duplicate_and_interleave, we just need one element for
3668 each scalar statement. */
3669 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3670 nunits = group_size;
3671
3672 number_of_copies = nunits * number_of_vectors / group_size;
3673
3674 number_of_places_left_in_vector = nunits;
3675 constant_p = true;
3676 tree_vector_builder elts (vector_type, nunits, 1);
3677 elts.quick_grow (nunits);
3678 bool place_after_defs = false;
3679 for (j = 0; j < number_of_copies; j++)
3680 {
3681 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3682 {
3683 /* Create 'vect_ = {op0,op1,...,opn}'. */
3684 number_of_places_left_in_vector--;
3685 tree orig_op = op;
3686 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3687 {
3688 if (CONSTANT_CLASS_P (op))
3689 {
3690 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3691 {
3692 /* Can't use VIEW_CONVERT_EXPR for booleans because
3693 of possibly different sizes of scalar value and
3694 vector element. */
3695 if (integer_zerop (op))
3696 op = build_int_cst (TREE_TYPE (vector_type), 0);
3697 else if (integer_onep (op))
3698 op = build_all_ones_cst (TREE_TYPE (vector_type));
3699 else
3700 gcc_unreachable ();
3701 }
3702 else
3703 op = fold_unary (VIEW_CONVERT_EXPR,
3704 TREE_TYPE (vector_type), op);
3705 gcc_assert (op && CONSTANT_CLASS_P (op));
3706 }
3707 else
3708 {
3709 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3710 gimple *init_stmt;
3711 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3712 {
3713 tree true_val
3714 = build_all_ones_cst (TREE_TYPE (vector_type));
3715 tree false_val
3716 = build_zero_cst (TREE_TYPE (vector_type));
3717 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3718 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3719 op, true_val,
3720 false_val);
3721 }
3722 else
3723 {
3724 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3725 op);
3726 init_stmt
3727 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3728 op);
3729 }
3730 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3731 op = new_temp;
3732 }
3733 }
3734 elts[number_of_places_left_in_vector] = op;
3735 if (!CONSTANT_CLASS_P (op))
3736 constant_p = false;
3737 if (TREE_CODE (orig_op) == SSA_NAME
3738 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3739 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3740 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3741 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3742 place_after_defs = true;
3743
3744 if (number_of_places_left_in_vector == 0)
3745 {
3746 if (constant_p
3747 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3748 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3749 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3750 else
3751 {
3752 if (permute_results.is_empty ())
3753 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3754 elts, number_of_vectors,
3755 permute_results);
3756 vec_cst = permute_results[number_of_vectors - j - 1];
3757 }
3758 tree init;
3759 gimple_stmt_iterator gsi;
3760 if (place_after_defs)
3761 {
3762 stmt_vec_info last_stmt_info
3763 = vect_find_last_scalar_stmt_in_slp (slp_node);
3764 gsi = gsi_for_stmt (last_stmt_info->stmt);
3765 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3766 &gsi);
3767 }
3768 else
3769 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3770 NULL);
3771 if (ctor_seq != NULL)
3772 {
3773 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3774 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3775 ctor_seq = NULL;
3776 }
3777 voprnds.quick_push (init);
3778 place_after_defs = false;
3779 number_of_places_left_in_vector = nunits;
3780 constant_p = true;
3781 elts.new_vector (vector_type, nunits, 1);
3782 elts.quick_grow (nunits);
3783 }
3784 }
3785 }
3786
3787 /* Since the vectors were created in reverse order, push them onto
3788 VEC_OPRNDS in reverse to restore the original order. */
3789 vec_num = voprnds.length ();
3790 for (j = vec_num; j != 0; j--)
3791 {
3792 vop = voprnds[j - 1];
3793 vec_oprnds->quick_push (vop);
3794 }
3795
3796 /* If the VF is greater than the unrolling factor needed for the SLP
3797 group of stmts, NUMBER_OF_VECTORS is greater than the number of
3798 vectors created above (NUMBER_OF_SCALARS/NUNITS or
3799 NUNITS/NUMBER_OF_SCALARS), and hence we have to replicate them. */
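  /* E.g. (illustrative), if two vectors { s1, s2, s1, s2 } were built
     above but four are required, the loop below appends the same two
     vectors again.  */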
3800 while (number_of_vectors > vec_oprnds->length ())
3801 {
3802 tree neutral_vec = NULL;
3803
3804 if (neutral_op)
3805 {
3806 if (!neutral_vec)
3807 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3808
3809 vec_oprnds->quick_push (neutral_vec);
3810 }
3811 else
3812 {
3813 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3814 vec_oprnds->quick_push (vop);
3815 }
3816 }
3817 }
3818
3819
3820 /* Get vectorized definitions from SLP_NODE, which contains the
3821 corresponding vectorized def-stmts. */
3822
3823 static void
3824 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3825 {
3826 stmt_vec_info vec_def_stmt_info;
3827 unsigned int i;
3828
3829 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3830
3831 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3832 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3833 }
3834
3835
3836 /* Get N vectorized definitions for SLP_NODE.
3837 If the scalar definitions are loop invariants or constants, collect them and
3838 call vect_get_constant_vectors() to create vector stmts.
3839 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3840 must be stored in the corresponding child of SLP_NODE, and we call
3841 vect_get_slp_vect_defs () to retrieve them. */
3842
3843 void
3844 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3845 {
3846 if (n == -1U)
3847 n = SLP_TREE_CHILDREN (slp_node).length ();
3848
3849 for (unsigned i = 0; i < n; ++i)
3850 {
3851 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3852
3853 vec<tree> vec_defs = vNULL;
3854
3855 /* For each operand we check if it has vectorized definitions in a child
3856 node or we need to create them (for invariants and constants). */
3857 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3858 {
3859 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3860 vect_get_slp_vect_defs (child, &vec_defs);
3861 }
3862 else
3863 vect_get_constant_vectors (slp_node, i, &vec_defs);
3864
3865 vec_oprnds->quick_push (vec_defs);
3866 }
3867 }
3868
3869 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3870 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3871 permute statements for the SLP node NODE of the SLP instance
3872 SLP_NODE_INSTANCE. */
3873
3874 bool
3875 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3876 gimple_stmt_iterator *gsi, poly_uint64 vf,
3877 slp_instance slp_node_instance, bool analyze_only,
3878 unsigned *n_perms)
3879 {
3880 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3881 vec_info *vinfo = stmt_info->vinfo;
3882 int vec_index = 0;
3883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3884 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3885 unsigned int mask_element;
3886 machine_mode mode;
3887
3888 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3889 return false;
3890
3891 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3892
3893 mode = TYPE_MODE (vectype);
3894 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3895
3896 /* Initialize the vect stmts of NODE to properly insert the generated
3897 stmts later. */
3898 if (! analyze_only)
3899 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3900 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3901 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3902
3903 /* Generate permutation masks for every NODE. Number of masks for each NODE
3904 is equal to GROUP_SIZE.
3905 E.g., we have a group of three nodes with three loads from the same
3906 location in each node, and the vector size is 4. I.e., we have an
3907 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3908 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3909 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3910 ...
3911
3912 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3913 The last mask is illegal since we assume two operands for the permute
3914 operation; the mask elements must therefore index into at most two vectors.
3915 Hence, the last mask must be converted into {2,5,5,5}.
3916 For the first two permutations we need the first and the second input
3917 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3918 we need the second and the third vectors: {b1,c1,a2,b2} and
3919 {c2,a3,b3,c3}. */
3920
3921 int vect_stmts_counter = 0;
3922 unsigned int index = 0;
3923 int first_vec_index = -1;
3924 int second_vec_index = -1;
3925 bool noop_p = true;
3926 *n_perms = 0;
3927
3928 vec_perm_builder mask;
3929 unsigned int nelts_to_build;
3930 unsigned int nvectors_per_build;
3931 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3932 && multiple_p (nunits, group_size));
3933 if (repeating_p)
3934 {
3935 /* A single vector contains a whole number of copies of the node, so:
3936 (a) all permutes can use the same mask; and
3937 (b) the permutes only need a single vector input. */
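      /* For instance (illustrative), with GROUP_SIZE = 2, a load
	 permutation of { 1, 0 } and 8-element vectors, the single
	 encoded mask expands to { 1, 0, 3, 2, 5, 4, 7, 6 } and is
	 reused for every vector statement.  */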
3938 mask.new_vector (nunits, group_size, 3);
3939 nelts_to_build = mask.encoded_nelts ();
3940 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3941 }
3942 else
3943 {
3944 /* We need to construct a separate mask for each vector statement. */
3945 unsigned HOST_WIDE_INT const_nunits, const_vf;
3946 if (!nunits.is_constant (&const_nunits)
3947 || !vf.is_constant (&const_vf))
3948 return false;
3949 mask.new_vector (const_nunits, const_nunits, 1);
3950 nelts_to_build = const_vf * group_size;
3951 nvectors_per_build = 1;
3952 }
3953
3954 unsigned int count = mask.encoded_nelts ();
3955 mask.quick_grow (count);
3956 vec_perm_indices indices;
3957
3958 for (unsigned int j = 0; j < nelts_to_build; j++)
3959 {
3960 unsigned int iter_num = j / group_size;
3961 unsigned int stmt_num = j % group_size;
3962 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3963 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3964 if (repeating_p)
3965 {
3966 first_vec_index = 0;
3967 mask_element = i;
3968 }
3969 else
3970 {
3971 /* Enforced before the loop when !repeating_p. */
3972 unsigned int const_nunits = nunits.to_constant ();
3973 vec_index = i / const_nunits;
3974 mask_element = i % const_nunits;
3975 if (vec_index == first_vec_index
3976 || first_vec_index == -1)
3977 {
3978 first_vec_index = vec_index;
3979 }
3980 else if (vec_index == second_vec_index
3981 || second_vec_index == -1)
3982 {
3983 second_vec_index = vec_index;
3984 mask_element += const_nunits;
3985 }
3986 else
3987 {
3988 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "permutation requires at "
3991 "least three vectors %G",
3992 stmt_info->stmt);
3993 gcc_assert (analyze_only);
3994 return false;
3995 }
3996
3997 gcc_assert (mask_element < 2 * const_nunits);
3998 }
3999
4000 if (mask_element != index)
4001 noop_p = false;
4002 mask[index++] = mask_element;
4003
4004 if (index == count && !noop_p)
4005 {
4006 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4007 if (!can_vec_perm_const_p (mode, indices))
4008 {
4009 if (dump_enabled_p ())
4010 {
4011 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4012 vect_location,
4013 "unsupported vect permute { ");
4014 for (i = 0; i < count; ++i)
4015 {
4016 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4017 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4018 }
4019 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4020 }
4021 gcc_assert (analyze_only);
4022 return false;
4023 }
4024
4025 ++*n_perms;
4026 }
4027
4028 if (index == count)
4029 {
4030 if (!analyze_only)
4031 {
4032 tree mask_vec = NULL_TREE;
4033
4034 if (! noop_p)
4035 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4036
4037 if (second_vec_index == -1)
4038 second_vec_index = first_vec_index;
4039
4040 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4041 {
4042 /* Generate the permute statement if necessary. */
4043 tree first_vec = dr_chain[first_vec_index + ri];
4044 tree second_vec = dr_chain[second_vec_index + ri];
4045 stmt_vec_info perm_stmt_info;
4046 if (! noop_p)
4047 {
4048 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4049 tree perm_dest
4050 = vect_create_destination_var (gimple_assign_lhs (stmt),
4051 vectype);
4052 perm_dest = make_ssa_name (perm_dest);
4053 gassign *perm_stmt
4054 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4055 first_vec, second_vec,
4056 mask_vec);
4057 perm_stmt_info
4058 = vect_finish_stmt_generation (stmt_info, perm_stmt,
4059 gsi);
4060 }
4061 else
4062 /* If mask was NULL_TREE generate the requested
4063 identity transform. */
4064 perm_stmt_info = vinfo->lookup_def (first_vec);
4065
4066 /* Store the vector statement in NODE. */
4067 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4068 = perm_stmt_info;
4069 }
4070 }
4071
4072 index = 0;
4073 first_vec_index = -1;
4074 second_vec_index = -1;
4075 noop_p = true;
4076 }
4077 }
4078
4079 return true;
4080 }
4081
4082 /* Vectorize SLP instance tree in postorder. */
4083
4084 static void
4085 vect_schedule_slp_instance (slp_tree node, slp_instance instance)
4086 {
4087 gimple_stmt_iterator si;
4088 stmt_vec_info stmt_info;
4089 unsigned int group_size;
4090 tree vectype;
4091 int i, j;
4092 slp_tree child;
4093
4094 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4095 return;
4096
4097 /* See if we have already vectorized the node in the graph of the
4098 SLP instance. */
4099 if (SLP_TREE_VEC_STMTS (node).exists ())
4100 return;
4101
4102 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4103 vect_schedule_slp_instance (child, instance);
4104
4105 /* Push SLP node def-type to stmts. */
4106 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4107 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4108 {
4109 stmt_vec_info child_stmt_info;
4110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4111 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4112 }
4113
4114 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4115
4116 /* VECTYPE is the type of the destination. */
4117 vectype = STMT_VINFO_VECTYPE (stmt_info);
4118 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4119 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4120
4121 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4122 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4123
4124 if (dump_enabled_p ())
4125 dump_printf_loc (MSG_NOTE, vect_location,
4126 "------>vectorizing SLP node starting from: %G",
4127 stmt_info->stmt);
4128
4129 /* Vectorized stmts go before the last scalar stmt, which is where
4130 all uses are ready. */
4131 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4132 si = gsi_for_stmt (last_stmt_info->stmt);
4133
4134 /* Handle two-operation SLP nodes by vectorizing the group with
4135 both operations and then performing a merge. */
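  /* E.g. (illustrative) for an addsub group { a0 + b0, a1 - b1 } the
     node is vectorized once with PLUS_EXPR and once with MINUS_EXPR,
     and with two lanes the blend mask becomes { 0, 3 }: lane 0 from
     the PLUS result, lane 1 from the MINUS result.  */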
4136 bool done_p = false;
4137 if (SLP_TREE_TWO_OPERATORS (node))
4138 {
4139 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4140 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4141 enum tree_code ocode = ERROR_MARK;
4142 stmt_vec_info ostmt_info;
4143 vec_perm_builder mask (group_size, group_size, 1);
4144 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4145 {
4146 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4147 if (gimple_assign_rhs_code (ostmt) != code0)
4148 {
4149 mask.quick_push (1);
4150 ocode = gimple_assign_rhs_code (ostmt);
4151 }
4152 else
4153 mask.quick_push (0);
4154 }
4155 if (ocode != ERROR_MARK)
4156 {
4157 vec<stmt_vec_info> v0;
4158 vec<stmt_vec_info> v1;
4159 unsigned j;
4160 tree tmask = NULL_TREE;
4161 vect_transform_stmt (stmt_info, &si, node, instance);
4162 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4163 SLP_TREE_VEC_STMTS (node).truncate (0);
4164 gimple_assign_set_rhs_code (stmt, ocode);
4165 vect_transform_stmt (stmt_info, &si, node, instance);
4166 gimple_assign_set_rhs_code (stmt, code0);
4167 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4168 SLP_TREE_VEC_STMTS (node).truncate (0);
4169 tree meltype = build_nonstandard_integer_type
4170 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4171 tree mvectype = get_same_sized_vectype (meltype, vectype);
4172 unsigned k = 0, l;
4173 for (j = 0; j < v0.length (); ++j)
4174 {
4175 /* Enforced by vect_build_slp_tree, which rejects variable-length
4176 vectors for SLP_TREE_TWO_OPERATORS. */
4177 unsigned int const_nunits = nunits.to_constant ();
4178 tree_vector_builder melts (mvectype, const_nunits, 1);
4179 for (l = 0; l < const_nunits; ++l)
4180 {
4181 if (k >= group_size)
4182 k = 0;
4183 tree t = build_int_cst (meltype,
4184 mask[k++] * const_nunits + l);
4185 melts.quick_push (t);
4186 }
4187 tmask = melts.build ();
4188
4189 /* ??? Not all targets support a VEC_PERM_EXPR with a
4190 constant mask that would translate to a vec_merge RTX
4191 (with their vec_perm_const_ok). We can either not
4192 vectorize in that case or let veclower do its job.
4193 Unfortunately that isn't too great and at least for
4194 plus/minus we'd eventually like to match targets
4195 vector addsub instructions. */
4196 gimple *vstmt;
4197 vstmt = gimple_build_assign (make_ssa_name (vectype),
4198 VEC_PERM_EXPR,
4199 gimple_assign_lhs (v0[j]->stmt),
4200 gimple_assign_lhs (v1[j]->stmt),
4201 tmask);
4202 SLP_TREE_VEC_STMTS (node).quick_push
4203 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4204 }
4205 v0.release ();
4206 v1.release ();
4207 done_p = true;
4208 }
4209 }
4210 if (!done_p)
4211 vect_transform_stmt (stmt_info, &si, node, instance);
4212
4213 /* Restore stmt def-types. */
4214 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4215 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4216 {
4217 stmt_vec_info child_stmt_info;
4218 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4219 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4220 }
4221 }
4222
4223 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4224 For loop vectorization this is done in vectorizable_call, but for SLP
4225 it needs to be deferred until end of vect_schedule_slp, because multiple
4226 SLP instances may refer to the same scalar stmt. */
4227
4228 static void
4229 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4230 {
4231 gimple *new_stmt;
4232 gimple_stmt_iterator gsi;
4233 int i;
4234 slp_tree child;
4235 tree lhs;
4236 stmt_vec_info stmt_info;
4237
4238 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4239 return;
4240
4241 if (visited.add (node))
4242 return;
4243
4244 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4245 vect_remove_slp_scalar_calls (child, visited);
4246
4247 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4248 {
4249 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4250 if (!stmt || gimple_bb (stmt) == NULL)
4251 continue;
4252 if (is_pattern_stmt_p (stmt_info)
4253 || !PURE_SLP_STMT (stmt_info))
4254 continue;
4255 lhs = gimple_call_lhs (stmt);
4256 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4257 gsi = gsi_for_stmt (stmt);
4258 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4259 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4260 }
4261 }
4262
4263 static void
4264 vect_remove_slp_scalar_calls (slp_tree node)
4265 {
4266 hash_set<slp_tree> visited;
4267 vect_remove_slp_scalar_calls (node, visited);
4268 }
4269
4270 /* Vectorize the instance root. */
4271
4272 void
4273 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4274 {
4275 gassign *rstmt = NULL;
4276
4277 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4278 {
4279 stmt_vec_info child_stmt_info;
4280 int j;
4281
4282 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4283 {
4284 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4285 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4286 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4287 TREE_TYPE (vect_lhs)))
4288 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4289 vect_lhs);
4290 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4291 break;
4292 }
4293 }
4294 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4295 {
4296 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4297 stmt_vec_info child_stmt_info;
4298 int j;
4299 vec<constructor_elt, va_gc> *v;
4300 vec_alloc (v, nelts);
4301
4302 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4303 {
4304 CONSTRUCTOR_APPEND_ELT (v,
4305 NULL_TREE,
4306 gimple_get_lhs (child_stmt_info->stmt));
4307 }
4308 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4309 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4310 tree r_constructor = build_constructor (rtype, v);
4311 rstmt = gimple_build_assign (lhs, r_constructor);
4312 }
4313
4314 gcc_assert (rstmt);
4315
4316 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4317 gsi_replace (&rgsi, rstmt, true);
4318 }
4319
4320 /* Generate vector code for all SLP instances in the loop/basic block. */
4321
4322 void
4323 vect_schedule_slp (vec_info *vinfo)
4324 {
4325 vec<slp_instance> slp_instances;
4326 slp_instance instance;
4327 unsigned int i;
4328
4329 slp_instances = vinfo->slp_instances;
4330 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4331 {
4332 slp_tree node = SLP_INSTANCE_TREE (instance);
4333 /* Schedule the tree of INSTANCE. */
4334 vect_schedule_slp_instance (node, instance);
4335
4336 if (SLP_INSTANCE_ROOT_STMT (instance))
4337 vectorize_slp_instance_root_stmt (node, instance);
4338
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "vectorizing stmts using SLP.\n");
4342 }
4343
4344 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4345 {
4346 slp_tree root = SLP_INSTANCE_TREE (instance);
4347 stmt_vec_info store_info;
4348 unsigned int j;
4349
4350 /* For reductions set the latch values of the vectorized PHIs. */
4351 if (instance->reduc_phis
4352 && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS
4353 (instance->reduc_phis)[0]) != FOLD_LEFT_REDUCTION
4354 && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS
4355 (instance->reduc_phis)[0]) != EXTRACT_LAST_REDUCTION)
4356 {
4357 slp_tree slp_node = root;
4358 slp_tree phi_node = instance->reduc_phis;
4359 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
4360 edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
4361 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
4362 == SLP_TREE_VEC_STMTS (slp_node).length ());
4363 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
4364 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]->stmt),
4365 gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[j]->stmt),
4366 e, gimple_phi_arg_location (phi, e->dest_idx));
4367 }
4368
4369 /* Remove scalar call stmts. Do not do this for basic-block
4370 vectorization as not all uses may be vectorized.
4371 ??? Why should this be necessary? DCE should be able to
4372 remove the stmts itself.
4373 ??? For BB vectorization we can as well remove scalar
4374 stmts starting from the SLP tree root if they have no
4375 uses. */
4376 if (is_a <loop_vec_info> (vinfo))
4377 vect_remove_slp_scalar_calls (root);
4378
4379 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4380 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4381 {
4382 if (!STMT_VINFO_DATA_REF (store_info))
4383 break;
4384
4385 if (SLP_INSTANCE_ROOT_STMT (instance))
4386 continue;
4387
4388 store_info = vect_orig_stmt (store_info);
4389 /* Free the attached stmt_vec_info and remove the stmt. */
4390 vinfo->remove_stmt (store_info);
4391 }
4392 }
4393 }
4394