/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
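	      /* As an illustration of the masks built below: for NELTS == 4
		 the two selectors are { 0, 4, 1, 5 } and { 2, 6, 3, 7 },
		 i.e. an interleave of the low halves and of the high halves
		 of the two input vectors respectively.  */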
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
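	  /* maps[1] = { 1, 0, 2, 3 } exchanges the two operands of the
	     embedded comparison (*SWAP == 1), while maps[2] = { 0, 1, 3, 2 }
	     keeps the comparison but exchanges the two arms of the COND_EXPR
	     (*SWAP == 2, used together with an inverted comparison code).  */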
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
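	      /* E.g. if the first stmt was a = b + c with an internal def
		 for B and a constant C, a later stmt d = 1 + e with the
		 def kinds reversed can still join the group once its
		 (commutative) operands are swapped.  */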
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
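      /* For instance, once one load of a four-load group participates in
	 an SLP node with vector type V4SI, a later attempt to assign a
	 different vector type to any member of the group must fail rather
	 than leave the group members with inconsistent vector types.  */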
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Try to infer and assign a vector type to all the statements in STMTS.
   Used only for BB vectorization.  */

static bool
vect_update_all_shared_vectypes (vec<stmt_vec_info> stmts)
{
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (stmts[0], &vectype,
				       &nunits_vectype, stmts.length ()))
    return false;

  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (!vect_update_shared_vectype (stmt_info, vectype))
      return false;

  return true;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */
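/* As an illustration: for the group { a = b + c; d = e - f } with a
   four-element VECTYPE, ALT_STMT_CODE is MINUS_EXPR and the selector
   built below is { 0, 5, 2, 7 }, taking even lanes from the all-PLUS
   vector and odd lanes from the all-MINUS one.  */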
static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
			       unsigned int group_size, tree vectype,
			       tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      vec_info *vinfo = stmt_info->vinfo;
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}  /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (!vect_two_operations_perm_ok_p (stmts, group_size,
					  vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt %G", stmts[i]->stmt);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt %G", first_stmt_info->stmt);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
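
/* Build an SLP tree for the stmt group STMTS, caching the result in
   BST_MAP keyed on the stmt set: if the same group is encountered again,
   the cached node (or recorded failure) is reused instead of rebuilding
   the subtree, with a successful node's refcount bumped per use.  */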
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP tree; MATCHES indicates which stmts failed to
   match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts);
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		(load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
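  /* The children are built per operand position, i.e. the group is
     transposed: for the two stmts { a = b + c; d = e + f } operand
     zero collects the def stmts of { b, e } and operand one those
     of { c, f }.  */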
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR has one operand too many if the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  /* If we have all children of a non-unary child built up from
	     scalars then just throw that away and build it up this node
	     from scalars.  */
	  if (is_a <bb_vec_info> (vinfo)
	      && SLP_TREE_CHILDREN (child).length () > 1
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !oprnd_info->any_pattern)
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
		  break;
	      if (!grandchild
		  && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
		{
		  /* Roll back.  */
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Building parent vector operands from "
				     "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
		  oprnd_info->ops = vNULL;
		  ++this_tree_size;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern
	  && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* After swapping some operands we lost track whether an
	     operand has any pattern defs so be conservative here.  */
	  if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
	    oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      /* If we have all children of a non-unary child built up from
		 scalars then just throw that away and build it up this node
		 from scalars.  */
	      if (is_a <bb_vec_info> (vinfo)
		  && SLP_TREE_CHILDREN (child).length () > 1
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !oprnd_info->any_pattern)
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
		      break;
		  if (!grandchild
		      && (vect_update_all_shared_vectypes
			  (oprnd_info->def_stmts)))
		    {
		      /* Roll back.  */
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild, false);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Building parent vector operands from "
					 "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
		      oprnd_info->ops = vNULL;
		      ++this_tree_size;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i, j;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
"," : ""); 1628 dump_printf (metadata, "}\n"); 1629 } 1630 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 1631 { 1632 dump_printf_loc (metadata, user_loc, "\tload permutation {"); 1633 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j) 1634 dump_printf (dump_kind, " %u", j); 1635 dump_printf (dump_kind, " }\n"); 1636 } 1637 if (SLP_TREE_CHILDREN (node).is_empty ()) 1638 return; 1639 dump_printf_loc (metadata, user_loc, "\tchildren"); 1640 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1641 dump_printf (dump_kind, " %p", (void *)child); 1642 dump_printf (dump_kind, "\n"); 1643 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1644 vect_print_slp_tree (dump_kind, loc, child, visited); 1645 } 1646 1647 static void 1648 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, 1649 slp_tree node) 1650 { 1651 hash_set<slp_tree> visited; 1652 vect_print_slp_tree (dump_kind, loc, node, visited); 1653 } 1654 1655 /* Mark the tree rooted at NODE with PURE_SLP. */ 1656 1657 static void 1658 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited) 1659 { 1660 int i; 1661 stmt_vec_info stmt_info; 1662 slp_tree child; 1663 1664 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 1665 return; 1666 1667 if (visited.add (node)) 1668 return; 1669 1670 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) 1671 STMT_SLP_TYPE (stmt_info) = pure_slp; 1672 1673 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1674 vect_mark_slp_stmts (child, visited); 1675 } 1676 1677 static void 1678 vect_mark_slp_stmts (slp_tree node) 1679 { 1680 hash_set<slp_tree> visited; 1681 vect_mark_slp_stmts (node, visited); 1682 } 1683 1684 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ 1685 1686 static void 1687 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited) 1688 { 1689 int i; 1690 stmt_vec_info stmt_info; 1691 slp_tree child; 1692 1693 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 1694 return; 1695 1696 if (visited.add (node)) 1697 return; 1698 1699 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) 1700 { 1701 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) 1702 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); 1703 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; 1704 } 1705 1706 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1707 vect_mark_slp_stmts_relevant (child, visited); 1708 } 1709 1710 static void 1711 vect_mark_slp_stmts_relevant (slp_tree node) 1712 { 1713 hash_set<slp_tree> visited; 1714 vect_mark_slp_stmts_relevant (node, visited); 1715 } 1716 1717 /* Copy the SLP subtree rooted at NODE. 

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = XNEW (_slp_tree);
  slp_tree copy = copy_ref;
  memcpy (copy, node, sizeof (_slp_tree));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
  copy->refcnt = 0;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      /* ??? Computation nodes are isomorphic and need no rearrangement.
	 This is a quick hack to cover those where rearrangement breaks
	 semantics because only the first stmt is guaranteed to have the
	 correct operation code due to others being swapped or inverted.  */
      stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
      if (is_gimple_assign (first->stmt)
	  && gimple_assign_rhs_code (first->stmt) == COND_EXPR)
	return;
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */
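/* For instance, if every load node of the instance uses the load
   permutation { 2, 0, 3, 1 }, the scalar stmts of all nodes can be
   rearranged by that permutation instead (the lane order does not
   matter for a reduction), after which the loads need no permutation
   at all.  */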
*/ 1811 1812 static bool 1813 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) 1814 { 1815 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); 1816 unsigned int i, j; 1817 unsigned int lidx; 1818 slp_tree node, load; 1819 1820 /* Compare all the permutation sequences to the first one. We know 1821 that at least one load is permuted. */ 1822 node = SLP_INSTANCE_LOADS (slp_instn)[0]; 1823 if (!node->load_permutation.exists ()) 1824 return false; 1825 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i) 1826 { 1827 if (!load->load_permutation.exists ()) 1828 return false; 1829 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx) 1830 if (lidx != node->load_permutation[j]) 1831 return false; 1832 } 1833 1834 /* Check that the loads in the first sequence are different and there 1835 are no gaps between them. */ 1836 auto_sbitmap load_index (group_size); 1837 bitmap_clear (load_index); 1838 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx) 1839 { 1840 if (lidx >= group_size) 1841 return false; 1842 if (bitmap_bit_p (load_index, lidx)) 1843 return false; 1844 1845 bitmap_set_bit (load_index, lidx); 1846 } 1847 for (i = 0; i < group_size; i++) 1848 if (!bitmap_bit_p (load_index, i)) 1849 return false; 1850 1851 /* This permutation is valid for reduction. Since the order of the 1852 statements in the nodes is not important unless they are memory 1853 accesses, we can rearrange the statements in all the nodes 1854 according to the order of the loads. */ 1855 1856 /* We have to unshare the SLP tree we modify. */ 1857 hash_map<slp_tree, slp_tree> map; 1858 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map); 1859 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false); 1860 unshared->refcnt++; 1861 SLP_INSTANCE_TREE (slp_instn) = unshared; 1862 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) 1863 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node); 1864 node = SLP_INSTANCE_LOADS (slp_instn)[0]; 1865 1866 /* Do the actual re-arrangement. */ 1867 hash_set<slp_tree> visited; 1868 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, 1869 node->load_permutation, visited); 1870 1871 /* We are done, no actual permutations need to be generated. */ 1872 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn); 1873 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) 1874 { 1875 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 1876 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); 1877 /* But we have to keep those permutations that are required because 1878 of handling of gaps. */ 1879 if (known_eq (unrolling_factor, 1U) 1880 || (group_size == DR_GROUP_SIZE (first_stmt_info) 1881 && DR_GROUP_GAP (first_stmt_info) == 0)) 1882 SLP_TREE_LOAD_PERMUTATION (node).release (); 1883 else 1884 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j) 1885 SLP_TREE_LOAD_PERMUTATION (node)[j] = j; 1886 } 1887 1888 return true; 1889 } 1890 1891 /* Gather loads in the SLP graph NODE and populate the INST loads array. 
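   Only leaf nodes are collected, and a leaf is recorded only if it is
   an internal def performing a grouped read; operands built from
   external scalars or constants are skipped.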
*/

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && !SLP_INSTANCE_ROOT_STMT (slp_instn))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.
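         A subchain is a run of consecutive group elements: loads of
         elements { 2, 3, 4 } of a group can be vectorized starting at
         element 2 without any permutation, while loads of { 2, 4, 3 }
         cannot.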
*/
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "BB vectorization with gaps at the end of "
                                     "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                     vect_location,
                                     "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
                             LOOP_VINFO_VECT_FACTOR
                             (STMT_VINFO_LOOP_VINFO (stmt_info)));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
                                          slp_instn, true, &n_perms))
      return false;

  return true;
}


/* Find the last scalar statement in NODE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}

/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.
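   For example, splitting a group of eight stores with GROUP1_SIZE 3
   produces a group of size 3 whose gap additionally skips the five
   trailing stmts and a group of size 5 whose gap additionally skips
   the three leading ones.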
*/ 2079 2080 static stmt_vec_info 2081 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size) 2082 { 2083 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo); 2084 gcc_assert (group1_size > 0); 2085 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size; 2086 gcc_assert (group2_size > 0); 2087 DR_GROUP_SIZE (first_vinfo) = group1_size; 2088 2089 stmt_vec_info stmt_info = first_vinfo; 2090 for (unsigned i = group1_size; i > 1; i--) 2091 { 2092 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info); 2093 gcc_assert (DR_GROUP_GAP (stmt_info) == 1); 2094 } 2095 /* STMT is now the last element of the first group. */ 2096 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info); 2097 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0; 2098 2099 DR_GROUP_SIZE (group2) = group2_size; 2100 for (stmt_info = group2; stmt_info; 2101 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) 2102 { 2103 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2; 2104 gcc_assert (DR_GROUP_GAP (stmt_info) == 1); 2105 } 2106 2107 /* For the second group, the DR_GROUP_GAP is that before the original group, 2108 plus skipping over the first vector. */ 2109 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size; 2110 2111 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */ 2112 DR_GROUP_GAP (first_vinfo) += group2_size; 2113 2114 if (dump_enabled_p ()) 2115 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n", 2116 group1_size, group2_size); 2117 2118 return group2; 2119 } 2120 2121 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE 2122 statements and a vector of NUNITS elements. */ 2123 2124 static poly_uint64 2125 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) 2126 { 2127 return exact_div (common_multiple (nunits, group_size), group_size); 2128 } 2129 2130 /* Analyze an SLP instance starting from a group of grouped stores. Call 2131 vect_build_slp_tree to build a tree of packed stmts if possible. 2132 Return FALSE if it's impossible to SLP any stmt in the loop. 
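   STMT_INFO is either the first stmt of a group of grouped stores, the
   first stmt of a reduction chain, a vector CONSTRUCTOR assignment, or
   a representative of the loop's reductions.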
*/ 2133 2134 static bool 2135 vect_analyze_slp_instance (vec_info *vinfo, 2136 scalar_stmts_to_slp_tree_map_t *bst_map, 2137 stmt_vec_info stmt_info, unsigned max_tree_size) 2138 { 2139 slp_instance new_instance; 2140 slp_tree node; 2141 unsigned int group_size; 2142 tree vectype, scalar_type = NULL_TREE; 2143 unsigned int i; 2144 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 2145 vec<stmt_vec_info> scalar_stmts; 2146 bool constructor = false; 2147 2148 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2149 { 2150 scalar_type = TREE_TYPE (DR_REF (dr)); 2151 group_size = DR_GROUP_SIZE (stmt_info); 2152 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size); 2153 } 2154 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 2155 { 2156 gcc_assert (is_a <loop_vec_info> (vinfo)); 2157 vectype = STMT_VINFO_VECTYPE (stmt_info); 2158 group_size = REDUC_GROUP_SIZE (stmt_info); 2159 } 2160 else if (is_gimple_assign (stmt_info->stmt) 2161 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR) 2162 { 2163 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)); 2164 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt)); 2165 constructor = true; 2166 } 2167 else 2168 { 2169 gcc_assert (is_a <loop_vec_info> (vinfo)); 2170 vectype = STMT_VINFO_VECTYPE (stmt_info); 2171 group_size = as_a <loop_vec_info> (vinfo)->reductions.length (); 2172 } 2173 2174 if (!vectype) 2175 { 2176 if (dump_enabled_p ()) 2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2178 "Build SLP failed: unsupported data-type %T\n", 2179 scalar_type); 2180 2181 return false; 2182 } 2183 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 2184 2185 /* Create a node (a root of the SLP tree) for the packed grouped stores. */ 2186 scalar_stmts.create (group_size); 2187 stmt_vec_info next_info = stmt_info; 2188 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2189 { 2190 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ 2191 while (next_info) 2192 { 2193 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); 2194 next_info = DR_GROUP_NEXT_ELEMENT (next_info); 2195 } 2196 } 2197 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 2198 { 2199 /* Collect the reduction stmts and store them in 2200 SLP_TREE_SCALAR_STMTS. */ 2201 while (next_info) 2202 { 2203 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); 2204 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); 2205 } 2206 /* Mark the first element of the reduction chain as reduction to properly 2207 transform the node. In the reduction analysis phase only the last 2208 element of the chain is marked as reduction. */ 2209 STMT_VINFO_DEF_TYPE (stmt_info) 2210 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); 2211 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) 2212 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); 2213 } 2214 else if (constructor) 2215 { 2216 tree rhs = gimple_assign_rhs1 (stmt_info->stmt); 2217 tree val; 2218 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) 2219 { 2220 if (TREE_CODE (val) == SSA_NAME) 2221 { 2222 gimple* def = SSA_NAME_DEF_STMT (val); 2223 stmt_vec_info def_info = vinfo->lookup_stmt (def); 2224 /* Value is defined in another basic block. 
*/ 2225 if (!def_info) 2226 return false; 2227 def_info = vect_stmt_to_vectorize (def_info); 2228 scalar_stmts.safe_push (def_info); 2229 } 2230 else 2231 return false; 2232 } 2233 if (dump_enabled_p ()) 2234 dump_printf_loc (MSG_NOTE, vect_location, 2235 "Analyzing vectorizable constructor: %G\n", 2236 stmt_info->stmt); 2237 } 2238 else 2239 { 2240 /* Collect reduction statements. */ 2241 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions; 2242 for (i = 0; reductions.iterate (i, &next_info); i++) 2243 scalar_stmts.safe_push (next_info); 2244 } 2245 2246 /* Build the tree for the SLP instance. */ 2247 bool *matches = XALLOCAVEC (bool, group_size); 2248 unsigned npermutes = 0; 2249 poly_uint64 max_nunits = nunits; 2250 unsigned tree_size = 0; 2251 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, 2252 &max_nunits, matches, &npermutes, 2253 &tree_size, bst_map); 2254 if (node != NULL) 2255 { 2256 /* Calculate the unrolling factor based on the smallest type. */ 2257 poly_uint64 unrolling_factor 2258 = calculate_unrolling_factor (max_nunits, group_size); 2259 2260 if (maybe_ne (unrolling_factor, 1U) 2261 && is_a <bb_vec_info> (vinfo)) 2262 { 2263 unsigned HOST_WIDE_INT const_max_nunits; 2264 if (!max_nunits.is_constant (&const_max_nunits) 2265 || const_max_nunits > group_size) 2266 { 2267 if (dump_enabled_p ()) 2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2269 "Build SLP failed: store group " 2270 "size not a multiple of the vector size " 2271 "in basic block SLP\n"); 2272 vect_free_slp_tree (node, false); 2273 return false; 2274 } 2275 /* Fatal mismatch. */ 2276 matches[group_size / const_max_nunits * const_max_nunits] = false; 2277 vect_free_slp_tree (node, false); 2278 } 2279 else 2280 { 2281 /* Create a new SLP instance. */ 2282 new_instance = XNEW (class _slp_instance); 2283 SLP_INSTANCE_TREE (new_instance) = node; 2284 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; 2285 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; 2286 SLP_INSTANCE_LOADS (new_instance) = vNULL; 2287 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL; 2288 new_instance->reduc_phis = NULL; 2289 2290 vect_gather_slp_loads (new_instance, node); 2291 if (dump_enabled_p ()) 2292 dump_printf_loc (MSG_NOTE, vect_location, 2293 "SLP size %u vs. limit %u.\n", 2294 tree_size, max_tree_size); 2295 2296 /* Compute the load permutation. */ 2297 slp_tree load_node; 2298 bool loads_permuted = false; 2299 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) 2300 { 2301 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) 2302 continue; 2303 unsigned j; 2304 stmt_vec_info load_info; 2305 bool this_load_permuted = false; 2306 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT 2307 (SLP_TREE_SCALAR_STMTS (load_node)[0]); 2308 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) 2309 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) 2310 { 2311 this_load_permuted = true; 2312 break; 2313 } 2314 if (!this_load_permuted 2315 /* The load requires permutation when unrolling exposes 2316 a gap either because the group is larger than the SLP 2317 group-size or because there is a gap between the groups. 
*/
              && group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0)
            {
              SLP_TREE_LOAD_PERMUTATION (load_node).release ();
              continue;
            }
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported load "
                                 "permutation %G", stmt_info->stmt);
              vect_free_slp_instance (new_instance, false);
              return false;
            }
        }

      /* If the loads and stores can be handled with load/store-lane
         instructions do not generate this SLP instance.  */
      if (is_a <loop_vec_info> (vinfo)
          && loads_permuted
          && dr && vect_store_lanes_supported (vectype, group_size, false))
        {
          slp_tree load_node;
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
            {
              stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
                  (SLP_TREE_SCALAR_STMTS (load_node)[0]);
              /* Use SLP for strided accesses (or if we can't load-lanes).  */
              if (STMT_VINFO_STRIDED_P (stmt_vinfo)
                  || ! vect_load_lanes_supported
                        (STMT_VINFO_VECTYPE (stmt_vinfo),
                         DR_GROUP_SIZE (stmt_vinfo), false))
                break;
            }
          if (i == SLP_INSTANCE_LOADS (new_instance).length ())
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Built SLP cancelled: can use "
                                 "load/store-lanes\n");
              vect_free_slp_instance (new_instance, false);
              return false;
            }
        }

      /* If this is a reduction chain with a conversion in front
         amend the SLP tree with a node for that.  */
      if (!dr
          && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
        {
          /* Get at the conversion stmt - we know it's the single use
             of the last stmt of the reduction chain.  */
          gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
          use_operand_p use_p;
          gimple *use_stmt;
          bool r = single_imm_use (gimple_assign_lhs (tem),
                                   &use_p, &use_stmt);
          gcc_assert (r);
          next_info = vinfo->lookup_stmt (use_stmt);
          next_info = vect_stmt_to_vectorize (next_info);
          scalar_stmts = vNULL;
          scalar_stmts.create (group_size);
          for (unsigned i = 0; i < group_size; ++i)
            scalar_stmts.quick_push (next_info);
          slp_tree conv = vect_create_new_slp_node (scalar_stmts);
          SLP_TREE_CHILDREN (conv).quick_push (node);
          SLP_INSTANCE_TREE (new_instance) = conv;
          /* We also have to fake this conversion stmt as SLP reduction
             group so we don't have to mess with too much code
             elsewhere.  */
          REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
          REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
        }

      vinfo->slp_instances.safe_push (new_instance);

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Final SLP tree for instance:\n");
          vect_print_slp_tree (MSG_NOTE, vect_location,
                               SLP_INSTANCE_TREE (new_instance));
        }

      return true;
    }
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.
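     For example, if with V4SI vectors the first mismatch in a group of
     nine stores is at element 6, the group is split at the vector
     boundary into 4 + 5; the second part is then split again so that
     the four stmts containing the mismatch are skipped and analysis is
     retried for the remaining single stmt.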
*/
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info)
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
         start.  */
      for (i = 0; i < group_size; i++)
        if (!matches[i]) break;

      if (i >= const_nunits && i < group_size)
        {
          /* Split into two groups at the first vector boundary before i.  */
          gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
          unsigned group1_size = i & ~(const_nunits - 1);

          stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
                                                           group1_size);
          bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
                                                max_tree_size);
          /* If the first non-match was in the middle of a vector,
             skip the rest of that vector.  */
          if (group1_size < i)
            {
              i = group1_size + const_nunits;
              if (i < group_size)
                rest = vect_split_slp_store_group (rest, const_nunits);
            }
          if (i < group_size)
            res |= vect_analyze_slp_instance (vinfo, bst_map,
                                              rest, max_tree_size);
          return res;
        }
      /* Even though the first vector did not all match, we might be able to SLP
         (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
        {
          /* Find SLP sequences starting from reduction chains.  */
          FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
            if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
                                             max_tree_size))
              {
                /* Dissolve reduction chain group.  */
                stmt_vec_info vinfo = first_element;
                stmt_vec_info last = NULL;
                while (vinfo)
                  {
                    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
                    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
                    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                    last = vinfo;
                    vinfo = next;
                  }
                STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
                /* It can still be vectorized as part of an SLP reduction.  */
                loop_vinfo->reductions.safe_push (last);
              }
        }

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
        vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
                                   max_tree_size);
    }

  /* The map keeps a reference on SLP nodes built, release that.
*/ 2514 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); 2515 it != bst_map->end (); ++it) 2516 if ((*it).second) 2517 vect_free_slp_tree ((*it).second, false); 2518 delete bst_map; 2519 2520 return opt_result::success (); 2521 } 2522 2523 2524 /* For each possible SLP instance decide whether to SLP it and calculate overall 2525 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at 2526 least one instance. */ 2527 2528 bool 2529 vect_make_slp_decision (loop_vec_info loop_vinfo) 2530 { 2531 unsigned int i; 2532 poly_uint64 unrolling_factor = 1; 2533 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); 2534 slp_instance instance; 2535 int decided_to_slp = 0; 2536 2537 DUMP_VECT_SCOPE ("vect_make_slp_decision"); 2538 2539 FOR_EACH_VEC_ELT (slp_instances, i, instance) 2540 { 2541 /* FORNOW: SLP if you can. */ 2542 /* All unroll factors have the form: 2543 2544 GET_MODE_SIZE (vinfo->vector_mode) * X 2545 2546 for some rational X, so they must have a common multiple. */ 2547 unrolling_factor 2548 = force_common_multiple (unrolling_factor, 2549 SLP_INSTANCE_UNROLLING_FACTOR (instance)); 2550 2551 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we 2552 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and 2553 loop-based vectorization. Such stmts will be marked as HYBRID. */ 2554 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); 2555 decided_to_slp++; 2556 } 2557 2558 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; 2559 2560 if (decided_to_slp && dump_enabled_p ()) 2561 { 2562 dump_printf_loc (MSG_NOTE, vect_location, 2563 "Decided to SLP %d instances. Unrolling factor ", 2564 decided_to_slp); 2565 dump_dec (MSG_NOTE, unrolling_factor); 2566 dump_printf (MSG_NOTE, "\n"); 2567 } 2568 2569 return (decided_to_slp > 0); 2570 } 2571 2572 2573 /* Private data for vect_detect_hybrid_slp. */ 2574 struct vdhs_data 2575 { 2576 loop_vec_info loop_vinfo; 2577 vec<stmt_vec_info> *worklist; 2578 }; 2579 2580 /* Walker for walk_gimple_op. */ 2581 2582 static tree 2583 vect_detect_hybrid_slp (tree *tp, int *, void *data) 2584 { 2585 walk_stmt_info *wi = (walk_stmt_info *)data; 2586 vdhs_data *dat = (vdhs_data *)wi->info; 2587 2588 if (wi->is_lhs) 2589 return NULL_TREE; 2590 2591 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp); 2592 if (!def_stmt_info) 2593 return NULL_TREE; 2594 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info); 2595 if (PURE_SLP_STMT (def_stmt_info)) 2596 { 2597 if (dump_enabled_p ()) 2598 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G", 2599 def_stmt_info->stmt); 2600 STMT_SLP_TYPE (def_stmt_info) = hybrid; 2601 dat->worklist->safe_push (def_stmt_info); 2602 } 2603 2604 return NULL_TREE; 2605 } 2606 2607 /* Find stmts that must be both vectorized and SLPed. */ 2608 2609 void 2610 vect_detect_hybrid_slp (loop_vec_info loop_vinfo) 2611 { 2612 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp"); 2613 2614 /* All stmts participating in SLP are marked pure_slp, all other 2615 stmts are loop_vect. 2616 First collect all loop_vect stmts into a worklist. 
*/ 2617 auto_vec<stmt_vec_info> worklist; 2618 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2619 { 2620 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2621 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); 2622 gsi_next (&gsi)) 2623 { 2624 gphi *phi = gsi.phi (); 2625 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi); 2626 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) 2627 worklist.safe_push (stmt_info); 2628 } 2629 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 2630 gsi_next (&gsi)) 2631 { 2632 gimple *stmt = gsi_stmt (gsi); 2633 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); 2634 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2635 { 2636 for (gimple_stmt_iterator gsi2 2637 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); 2638 !gsi_end_p (gsi2); gsi_next (&gsi2)) 2639 { 2640 stmt_vec_info patt_info 2641 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2)); 2642 if (!STMT_SLP_TYPE (patt_info) 2643 && STMT_VINFO_RELEVANT (patt_info)) 2644 worklist.safe_push (patt_info); 2645 } 2646 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 2647 } 2648 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info)) 2649 worklist.safe_push (stmt_info); 2650 } 2651 } 2652 2653 /* Now we have a worklist of non-SLP stmts, follow use->def chains and 2654 mark any SLP vectorized stmt as hybrid. 2655 ??? We're visiting def stmts N times (once for each non-SLP and 2656 once for each hybrid-SLP use). */ 2657 walk_stmt_info wi; 2658 vdhs_data dat; 2659 dat.worklist = &worklist; 2660 dat.loop_vinfo = loop_vinfo; 2661 memset (&wi, 0, sizeof (wi)); 2662 wi.info = (void *)&dat; 2663 while (!worklist.is_empty ()) 2664 { 2665 stmt_vec_info stmt_info = worklist.pop (); 2666 /* Since SSA operands are not set up for pattern stmts we need 2667 to use walk_gimple_op. */ 2668 wi.is_lhs = 0; 2669 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi); 2670 } 2671 } 2672 2673 2674 /* Initialize a bb_vec_info struct for the statements between 2675 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */ 2676 2677 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in, 2678 gimple_stmt_iterator region_end_in, 2679 vec_info_shared *shared) 2680 : vec_info (vec_info::bb, init_cost (NULL), shared), 2681 bb (gsi_bb (region_begin_in)), 2682 region_begin (region_begin_in), 2683 region_end (region_end_in) 2684 { 2685 gimple_stmt_iterator gsi; 2686 2687 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end); 2688 gsi_next (&gsi)) 2689 { 2690 gimple *stmt = gsi_stmt (gsi); 2691 gimple_set_uid (stmt, 0); 2692 add_stmt (stmt); 2693 } 2694 2695 bb->aux = this; 2696 } 2697 2698 2699 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the 2700 stmts in the basic block. */ 2701 2702 _bb_vec_info::~_bb_vec_info () 2703 { 2704 for (gimple_stmt_iterator si = region_begin; 2705 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si)) 2706 /* Reset region marker. */ 2707 gimple_set_uid (gsi_stmt (si), -1); 2708 2709 bb->aux = NULL; 2710 } 2711 2712 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE, 2713 given then that child nodes have already been processed, and that 2714 their def types currently match their SLP node's def type. 
*/ 2715 2716 static bool 2717 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, 2718 slp_instance node_instance, 2719 stmt_vector_for_cost *cost_vec) 2720 { 2721 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 2722 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect); 2723 2724 /* Calculate the number of vector statements to be created for the 2725 scalar stmts in this node. For SLP reductions it is equal to the 2726 number of vector statements in the children (which has already been 2727 calculated by the recursive call). Otherwise it is the number of 2728 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by 2729 VF divided by the number of elements in a vector. */ 2730 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2731 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 2732 { 2733 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) 2734 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) 2735 { 2736 SLP_TREE_NUMBER_OF_VEC_STMTS (node) 2737 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); 2738 break; 2739 } 2740 } 2741 else 2742 { 2743 poly_uint64 vf; 2744 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 2745 vf = loop_vinfo->vectorization_factor; 2746 else 2747 vf = 1; 2748 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance); 2749 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 2750 SLP_TREE_NUMBER_OF_VEC_STMTS (node) 2751 = vect_get_num_vectors (vf * group_size, vectype); 2752 } 2753 2754 bool dummy; 2755 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec); 2756 } 2757 2758 /* Try to build NODE from scalars, returning true on success. 2759 NODE_INSTANCE is the SLP instance that contains NODE. */ 2760 2761 static bool 2762 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, 2763 slp_instance node_instance) 2764 { 2765 stmt_vec_info stmt_info; 2766 unsigned int i; 2767 2768 if (!is_a <bb_vec_info> (vinfo) 2769 || node == SLP_INSTANCE_TREE (node_instance) 2770 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))) 2771 return false; 2772 2773 if (dump_enabled_p ()) 2774 dump_printf_loc (MSG_NOTE, vect_location, 2775 "Building vector operands from scalars instead\n"); 2776 2777 /* Don't remove and free the child nodes here, since they could be 2778 referenced by other structures. The analysis and scheduling phases 2779 (need to) ignore child nodes of anything that isn't vect_internal_def. */ 2780 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); 2781 SLP_TREE_DEF_TYPE (node) = vect_external_def; 2782 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size); 2783 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) 2784 { 2785 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); 2786 SLP_TREE_SCALAR_OPS (node)[i] = lhs; 2787 } 2788 return true; 2789 } 2790 2791 /* Analyze statements contained in SLP tree NODE after recursively analyzing 2792 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. 2793 2794 Return true if the operations are supported. */ 2795 2796 static bool 2797 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, 2798 slp_instance node_instance, 2799 hash_set<slp_tree> &visited, 2800 hash_set<slp_tree> &lvisited, 2801 stmt_vector_for_cost *cost_vec) 2802 { 2803 int i, j; 2804 slp_tree child; 2805 2806 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 2807 return true; 2808 2809 /* If we already analyzed the exact same set of scalar stmts we're done. 
2810 We share the generated vector stmts for those. 2811 The SLP graph is acyclic so not caching whether we failed or succeeded 2812 doesn't result in any issue since we throw away the lvisited set 2813 when we fail. */ 2814 if (visited.contains (node) 2815 || lvisited.add (node)) 2816 return true; 2817 2818 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 2819 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance, 2820 visited, lvisited, cost_vec)) 2821 return false; 2822 2823 /* ??? We have to catch the case late where two first scalar stmts appear 2824 in multiple SLP children with different def type and fail. Remember 2825 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily 2826 match it when that is vect_internal_def. */ 2827 auto_vec<vect_def_type, 4> dt; 2828 dt.safe_grow (SLP_TREE_CHILDREN (node).length ()); 2829 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2830 if (SLP_TREE_SCALAR_STMTS (child).length () != 0) 2831 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); 2832 2833 /* Push SLP node def-type to stmt operands. */ 2834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2835 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def 2836 && SLP_TREE_SCALAR_STMTS (child).length () != 0) 2837 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) 2838 = SLP_TREE_DEF_TYPE (child); 2839 2840 /* Check everything worked out. */ 2841 bool res = true; 2842 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2843 if (SLP_TREE_SCALAR_STMTS (child).length () != 0) 2844 { 2845 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 2846 { 2847 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) 2848 != SLP_TREE_DEF_TYPE (child)) 2849 res = false; 2850 } 2851 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) 2852 != dt[j]) 2853 res = false; 2854 } 2855 if (!res && dump_enabled_p ()) 2856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2857 "not vectorized: same operand with different " 2858 "def type in stmt.\n"); 2859 2860 if (res) 2861 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, 2862 cost_vec); 2863 2864 /* Restore def-types. */ 2865 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2866 if (SLP_TREE_SCALAR_STMTS (child).length () != 0) 2867 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; 2868 2869 /* If this node can't be vectorized, try pruning the tree here rather 2870 than felling the whole thing. */ 2871 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) 2872 res = true; 2873 2874 return res; 2875 } 2876 2877 2878 /* Analyze statements in SLP instances of VINFO. Return true if the 2879 operations are supported. */ 2880 2881 bool 2882 vect_slp_analyze_operations (vec_info *vinfo) 2883 { 2884 slp_instance instance; 2885 int i; 2886 2887 DUMP_VECT_SCOPE ("vect_slp_analyze_operations"); 2888 2889 hash_set<slp_tree> visited; 2890 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) 2891 { 2892 hash_set<slp_tree> lvisited; 2893 stmt_vector_for_cost cost_vec; 2894 cost_vec.create (2); 2895 if (!vect_slp_analyze_node_operations (vinfo, 2896 SLP_INSTANCE_TREE (instance), 2897 instance, visited, lvisited, 2898 &cost_vec) 2899 /* Instances with a root stmt require vectorized defs for the 2900 SLP tree root. 
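             If the tree root was instead demoted to be built from
             scalars the CONSTRUCTOR would never receive a vectorized
             value, so such instances are removed again here.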
*/ 2901 || (SLP_INSTANCE_ROOT_STMT (instance) 2902 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) 2903 != vect_internal_def))) 2904 { 2905 slp_tree node = SLP_INSTANCE_TREE (instance); 2906 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 2907 if (dump_enabled_p ()) 2908 dump_printf_loc (MSG_NOTE, vect_location, 2909 "removing SLP instance operations starting from: %G", 2910 stmt_info->stmt); 2911 vect_free_slp_instance (instance, false); 2912 vinfo->slp_instances.ordered_remove (i); 2913 cost_vec.release (); 2914 } 2915 else 2916 { 2917 for (hash_set<slp_tree>::iterator x = lvisited.begin(); 2918 x != lvisited.end(); ++x) 2919 visited.add (*x); 2920 i++; 2921 2922 add_stmt_costs (vinfo->target_cost_data, &cost_vec); 2923 cost_vec.release (); 2924 } 2925 } 2926 2927 return !vinfo->slp_instances.is_empty (); 2928 } 2929 2930 2931 /* Compute the scalar cost of the SLP node NODE and its children 2932 and return it. Do not account defs that are marked in LIFE and 2933 update LIFE according to uses of NODE. */ 2934 2935 static void 2936 vect_bb_slp_scalar_cost (basic_block bb, 2937 slp_tree node, vec<bool, va_heap> *life, 2938 stmt_vector_for_cost *cost_vec, 2939 hash_set<slp_tree> &visited) 2940 { 2941 unsigned i; 2942 stmt_vec_info stmt_info; 2943 slp_tree child; 2944 2945 if (visited.add (node)) 2946 return; 2947 2948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) 2949 { 2950 gimple *stmt = stmt_info->stmt; 2951 vec_info *vinfo = stmt_info->vinfo; 2952 ssa_op_iter op_iter; 2953 def_operand_p def_p; 2954 2955 if ((*life)[i]) 2956 continue; 2957 2958 /* If there is a non-vectorized use of the defs then the scalar 2959 stmt is kept live in which case we do not account it or any 2960 required defs in the SLP children in the scalar cost. This 2961 way we make the vectorization more costly when compared to 2962 the scalar cost. */ 2963 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) 2964 { 2965 imm_use_iterator use_iter; 2966 gimple *use_stmt; 2967 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p)) 2968 if (!is_gimple_debug (use_stmt)) 2969 { 2970 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt); 2971 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info)) 2972 { 2973 (*life)[i] = true; 2974 BREAK_FROM_IMM_USE_STMT (use_iter); 2975 } 2976 } 2977 } 2978 if ((*life)[i]) 2979 continue; 2980 2981 /* Count scalar stmts only once. */ 2982 if (gimple_visited_p (stmt)) 2983 continue; 2984 gimple_set_visited (stmt, true); 2985 2986 vect_cost_for_stmt kind; 2987 if (STMT_VINFO_DATA_REF (stmt_info)) 2988 { 2989 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) 2990 kind = scalar_load; 2991 else 2992 kind = scalar_store; 2993 } 2994 else if (vect_nop_conversion_p (stmt_info)) 2995 continue; 2996 else 2997 kind = scalar_stmt; 2998 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body); 2999 } 3000 3001 auto_vec<bool, 20> subtree_life; 3002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 3003 { 3004 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) 3005 { 3006 /* Do not directly pass LIFE to the recursive call, copy it to 3007 confine changes in the callee to the current child/subtree. */ 3008 subtree_life.safe_splice (*life); 3009 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec, 3010 visited); 3011 subtree_life.truncate (0); 3012 } 3013 } 3014 } 3015 3016 /* Check if vectorization of the basic block is profitable. 
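   The scalar cost of the stmts that the SLP instances would replace is
   compared against the vector cost inside the block plus any
   prologue/epilogue cost recorded during analysis.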
*/

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  /* Calculate scalar cost.  */
  stmt_vector_for_cost scalar_costs;
  scalar_costs.create (0);
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
      vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
    }
  void *target_cost_data = init_cost (NULL);
  add_stmt_costs (target_cost_data, &scalar_costs);
  scalar_costs.release ();
  unsigned dummy;
  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
  destroy_cost_data (target_cost_data);

  /* Unset visited flag.  */
  for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
    gimple_set_visited (gsi_stmt (gsi), false);

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost is no more than the cost
     of the scalar version.  Note that we err on the vector side for
     equal cost because the cost estimate is otherwise quite pessimistic
     (constant uses are free on the scalar side but cost a load on the
     vector side for example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}

/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  gimple_stmt_iterator gsi;

  for (gsi = bb_vinfo->region_begin;
       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
    {
      gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
      if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
        continue;

      tree rhs = gimple_assign_rhs1 (stmt);
      if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
          || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
                       CONSTRUCTOR_NELTS (rhs))
          || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
          || uniform_vector_p (rhs))
        continue;

      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
    }
}

/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.
When returning false, set FATAL to true if the same failure 3108 would prevent vectorization at other vector sizes, false if it is still 3109 worth trying other sizes. N_STMTS is the number of statements in the 3110 region. */ 3111 3112 static bool 3113 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) 3114 { 3115 DUMP_VECT_SCOPE ("vect_slp_analyze_bb"); 3116 3117 slp_instance instance; 3118 int i; 3119 poly_uint64 min_vf = 2; 3120 3121 /* The first group of checks is independent of the vector size. */ 3122 fatal = true; 3123 3124 /* Analyze the data references. */ 3125 3126 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) 3127 { 3128 if (dump_enabled_p ()) 3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3130 "not vectorized: unhandled data-ref in basic " 3131 "block.\n"); 3132 return false; 3133 } 3134 3135 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2) 3136 { 3137 if (dump_enabled_p ()) 3138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3139 "not vectorized: not enough data-refs in " 3140 "basic block.\n"); 3141 return false; 3142 } 3143 3144 if (!vect_analyze_data_ref_accesses (bb_vinfo)) 3145 { 3146 if (dump_enabled_p ()) 3147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3148 "not vectorized: unhandled data access in " 3149 "basic block.\n"); 3150 return false; 3151 } 3152 3153 vect_slp_check_for_constructors (bb_vinfo); 3154 3155 /* If there are no grouped stores in the region there is no need 3156 to continue with pattern recog as vect_analyze_slp will fail 3157 anyway. */ 3158 if (bb_vinfo->grouped_stores.is_empty ()) 3159 { 3160 if (dump_enabled_p ()) 3161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3162 "not vectorized: no grouped stores in " 3163 "basic block.\n"); 3164 return false; 3165 } 3166 3167 /* While the rest of the analysis below depends on it in some way. */ 3168 fatal = false; 3169 3170 vect_pattern_recog (bb_vinfo); 3171 3172 /* Check the SLP opportunities in the basic block, analyze and build SLP 3173 trees. */ 3174 if (!vect_analyze_slp (bb_vinfo, n_stmts)) 3175 { 3176 if (dump_enabled_p ()) 3177 { 3178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3179 "Failed to SLP the basic block.\n"); 3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3181 "not vectorized: failed to find SLP opportunities " 3182 "in basic block.\n"); 3183 } 3184 return false; 3185 } 3186 3187 vect_record_base_alignments (bb_vinfo); 3188 3189 /* Analyze and verify the alignment of data references and the 3190 dependence in the SLP instances. */ 3191 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); ) 3192 { 3193 if (! vect_slp_analyze_and_verify_instance_alignment (instance) 3194 || ! vect_slp_analyze_instance_dependence (instance)) 3195 { 3196 slp_tree node = SLP_INSTANCE_TREE (instance); 3197 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 3198 if (dump_enabled_p ()) 3199 dump_printf_loc (MSG_NOTE, vect_location, 3200 "removing SLP instance operations starting from: %G", 3201 stmt_info->stmt); 3202 vect_free_slp_instance (instance, false); 3203 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i); 3204 continue; 3205 } 3206 3207 /* Mark all the statements that we want to vectorize as pure SLP and 3208 relevant. 
*/ 3209 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); 3210 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); 3211 if (SLP_INSTANCE_ROOT_STMT (instance)) 3212 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp; 3213 3214 i++; 3215 } 3216 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) 3217 return false; 3218 3219 if (!vect_slp_analyze_operations (bb_vinfo)) 3220 { 3221 if (dump_enabled_p ()) 3222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3223 "not vectorized: bad operation in basic block.\n"); 3224 return false; 3225 } 3226 3227 /* Cost model: check if the vectorization is worthwhile. */ 3228 if (!unlimited_cost_model (NULL) 3229 && !vect_bb_vectorization_profitable_p (bb_vinfo)) 3230 { 3231 if (dump_enabled_p ()) 3232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3233 "not vectorized: vectorization is not " 3234 "profitable.\n"); 3235 return false; 3236 } 3237 3238 if (dump_enabled_p ()) 3239 dump_printf_loc (MSG_NOTE, vect_location, 3240 "Basic block will be vectorized using SLP\n"); 3241 return true; 3242 } 3243 3244 /* Subroutine of vect_slp_bb. Try to vectorize the statements between 3245 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true 3246 on success. The region has N_STMTS statements and has the datarefs 3247 given by DATAREFS. */ 3248 3249 static bool 3250 vect_slp_bb_region (gimple_stmt_iterator region_begin, 3251 gimple_stmt_iterator region_end, 3252 vec<data_reference_p> datarefs, 3253 unsigned int n_stmts) 3254 { 3255 bb_vec_info bb_vinfo; 3256 auto_vector_modes vector_modes; 3257 3258 /* Autodetect first vector size we try. */ 3259 machine_mode next_vector_mode = VOIDmode; 3260 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); 3261 unsigned int mode_i = 0; 3262 3263 vec_info_shared shared; 3264 3265 machine_mode autodetected_vector_mode = VOIDmode; 3266 while (1) 3267 { 3268 bool vectorized = false; 3269 bool fatal = false; 3270 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); 3271 3272 bool first_time_p = shared.datarefs.is_empty (); 3273 BB_VINFO_DATAREFS (bb_vinfo) = datarefs; 3274 if (first_time_p) 3275 bb_vinfo->shared->save_datarefs (); 3276 else 3277 bb_vinfo->shared->check_datarefs (); 3278 bb_vinfo->vector_mode = next_vector_mode; 3279 3280 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) 3281 && dbg_cnt (vect_slp)) 3282 { 3283 if (dump_enabled_p ()) 3284 { 3285 dump_printf_loc (MSG_NOTE, vect_location, 3286 "***** Analysis succeeded with vector mode" 3287 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode)); 3288 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); 3289 } 3290 3291 bb_vinfo->shared->check_datarefs (); 3292 vect_schedule_slp (bb_vinfo); 3293 3294 unsigned HOST_WIDE_INT bytes; 3295 if (dump_enabled_p ()) 3296 { 3297 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes)) 3298 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, 3299 "basic block part vectorized using %wu byte " 3300 "vectors\n", bytes); 3301 else 3302 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, 3303 "basic block part vectorized using variable " 3304 "length vectors\n"); 3305 } 3306 3307 vectorized = true; 3308 } 3309 else 3310 { 3311 if (dump_enabled_p ()) 3312 dump_printf_loc (MSG_NOTE, vect_location, 3313 "***** Analysis failed with vector mode %s\n", 3314 GET_MODE_NAME (bb_vinfo->vector_mode)); 3315 } 3316 3317 if (mode_i == 0) 3318 autodetected_vector_mode = bb_vinfo->vector_mode; 3319 3320 if (!fatal) 3321 while (mode_i < 
vector_modes.length ()
               && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
          {
            if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "***** The result for vector mode %s would"
                               " be the same\n",
                               GET_MODE_NAME (vector_modes[mode_i]));
            mode_i += 1;
          }

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
                                   GET_MODE_INNER (autodetected_vector_mode))
              == autodetected_vector_mode)
          && (related_vector_mode (autodetected_vector_mode,
                                   GET_MODE_INNER (vector_modes[mode_i]))
              == vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Skipping vector mode %s, which would"
                             " repeat the analysis for %s\n",
                             GET_MODE_NAME (vector_modes[mode_i]),
                             GET_MODE_NAME (autodetected_vector_mode));
          mode_i += 1;
        }

      if (vectorized
          || mode_i == vector_modes.length ()
          || autodetected_vector_mode == VOIDmode
          /* If vect_slp_analyze_bb_1 signaled that analysis for all
             vector sizes will fail do not bother iterating.  */
          || fatal)
        return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with vector mode %s\n",
                         GET_MODE_NAME (next_vector_mode));
    }
}

/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  */

bool
vect_slp_bb (basic_block bb)
{
  gimple_stmt_iterator gsi;
  bool any_vectorized = false;

  gsi = gsi_start_bb (bb);
  while (!gsi_end_p (gsi))
    {
      gimple_stmt_iterator region_begin = gsi;
      vec<data_reference_p> datarefs = vNULL;
      int insns = 0;

      for (; !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            continue;
          insns++;

          if (gimple_location (stmt) != UNKNOWN_LOCATION)
            vect_location = stmt;

          if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
            break;
        }

      /* Skip leading unhandled stmts.  */
      if (gsi_stmt (region_begin) == gsi_stmt (gsi))
        {
          gsi_next (&gsi);
          continue;
        }

      gimple_stmt_iterator region_end = gsi;

      if (insns > param_slp_max_insns_in_bb)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many instructions in "
                             "basic block.\n");
        }
      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
        any_vectorized = true;

      if (gsi_end_p (region_end))
        break;

      /* Skip the unhandled stmt.  */
      gsi_next (&gsi);
    }

  return any_vectorized;
}


/* Return true if the vector type for operand OP_NUM of STMT_VINFO is
   a boolean vector.  */

static bool
vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, unsigned op_num)
{
  enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
  tree op, vectype;
  enum vect_def_type dt;

  /* For comparisons and COND_EXPR the type is chosen depending
     on the other, non-constant comparison operand.
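     E.g., for a comparison with a constant operand the mask type is
     derived from the vector type chosen for the non-constant operand
     rather than from the constant itself.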
*/
  if (TREE_CODE_CLASS (code) == tcc_comparison)
    {
      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
      op = gimple_assign_rhs1 (stmt);

      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
        gcc_unreachable ();

      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
    }

  if (code == COND_EXPR)
    {
      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
      tree cond = gimple_assign_rhs1 (stmt);

      if (TREE_CODE (cond) == SSA_NAME)
        {
          if (op_num > 0)
            return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
          op = cond;
        }
      else
        {
          if (op_num > 1)
            return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
          op = TREE_OPERAND (cond, 0);
        }

      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
        gcc_unreachable ();

      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
    }

  return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
}

/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
                          vec<tree> elts, unsigned int nresults,
                          vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
         ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
        partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
                        TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
     correct byte contents.
3541 3542 Conceptually, we need to repeat the following operation log2(nvectors) 3543 times, where hi_start = nvectors / 2: 3544 3545 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); 3546 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); 3547 3548 However, if each input repeats every N elements and the VF is 3549 a multiple of N * 2, the HI result is the same as the LO result. 3550 This will be true for the first N1 iterations of the outer loop, 3551 followed by N2 iterations for which both the LO and HI results 3552 are needed. I.e.: 3553 3554 N1 + N2 = log2(nvectors) 3555 3556 Each "N1 iteration" doubles the number of redundant vectors and the 3557 effect of the process as a whole is to have a sequence of nvectors/2**N1 3558 vectors that repeats 2**N1 times. Rather than generate these redundant 3559 vectors, we halve the number of vectors for each N1 iteration. */ 3560 unsigned int in_start = 0; 3561 unsigned int out_start = nvectors; 3562 unsigned int new_nvectors = nvectors; 3563 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2) 3564 { 3565 unsigned int hi_start = new_nvectors / 2; 3566 unsigned int out_i = 0; 3567 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i) 3568 { 3569 if ((in_i & 1) != 0 3570 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type), 3571 2 * in_repeat)) 3572 continue; 3573 3574 tree output = make_ssa_name (new_vector_type); 3575 tree input1 = pieces[in_start + (in_i / 2)]; 3576 tree input2 = pieces[in_start + (in_i / 2) + hi_start]; 3577 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR, 3578 input1, input2, 3579 permutes[in_i & 1]); 3580 gimple_seq_add_stmt (seq, stmt); 3581 pieces[out_start + out_i] = output; 3582 out_i += 1; 3583 } 3584 std::swap (in_start, out_start); 3585 new_nvectors = out_i; 3586 } 3587 3588 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */ 3589 results.reserve (nresults); 3590 for (unsigned int i = 0; i < nresults; ++i) 3591 if (i < new_nvectors) 3592 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type, 3593 pieces[in_start + i])); 3594 else 3595 results.quick_push (results[i - new_nvectors]); 3596 } 3597 3598 3599 /* For constant and loop invariant defs of SLP_NODE this function returns 3600 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. 3601 OP_NODE determines the node for the operand containing the scalar 3602 operands. */ 3603 3604 static void 3605 vect_get_constant_vectors (slp_tree slp_node, unsigned op_num, 3606 vec<tree> *vec_oprnds) 3607 { 3608 slp_tree op_node = SLP_TREE_CHILDREN (slp_node)[op_num]; 3609 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0]; 3610 vec_info *vinfo = stmt_vinfo->vinfo; 3611 unsigned HOST_WIDE_INT nunits; 3612 tree vec_cst; 3613 unsigned j, number_of_places_left_in_vector; 3614 tree vector_type; 3615 tree vop; 3616 int group_size = op_node->ops.length (); 3617 unsigned int vec_num, i; 3618 unsigned number_of_copies = 1; 3619 bool constant_p; 3620 tree neutral_op = NULL; 3621 gimple_seq ctor_seq = NULL; 3622 auto_vec<tree, 16> permute_results; 3623 3624 /* ??? SLP analysis should compute the vector type for the 3625 constant / invariant and store it in the SLP node. */ 3626 tree op = op_node->ops[0]; 3627 /* Check if vector type is a boolean vector. 
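     A scalar boolean operand that is used as a vector mask needs the
     target's mask type rather than a plain data vector type: e.g.
     (illustrative types only), for a comparison whose
     STMT_VINFO_VECTYPE is V4SI the constant must become an element of
     truth_type_for (V4SI), the corresponding four-element boolean
     vector, which is what the code below selects.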
*/ 3628 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3629 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) 3630 && vect_mask_constant_operand_p (stmt_vinfo, op_num)) 3631 vector_type = truth_type_for (stmt_vectype); 3632 else 3633 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node); 3634 3635 /* ??? For lane-reducing ops we should also have the required number 3636 of vector stmts initialized rather than second-guessing here. */ 3637 unsigned int number_of_vectors; 3638 if (is_gimple_assign (stmt_vinfo->stmt) 3639 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR 3640 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR 3641 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR)) 3642 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 3643 else 3644 number_of_vectors 3645 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) 3646 * TYPE_VECTOR_SUBPARTS (stmt_vectype), 3647 vector_type); 3648 vec_oprnds->create (number_of_vectors); 3649 auto_vec<tree> voprnds (number_of_vectors); 3650 3651 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 3652 created vectors. It is greater than 1 if unrolling is performed. 3653 3654 For example, we have two scalar operands, s1 and s2 (e.g., group of 3655 strided accesses of size two), while NUNITS is four (i.e., four scalars 3656 of this type can be packed in a vector). The output vector will contain 3657 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 3658 will be 2). 3659 3660 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 3661 containing the operands. 3662 3663 For example, NUNITS is four as before, and the group size is 8 3664 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 3665 {s5, s6, s7, s8}. */ 3666 3667 /* When using duplicate_and_interleave, we just need one element for 3668 each scalar statement. */ 3669 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 3670 nunits = group_size; 3671 3672 number_of_copies = nunits * number_of_vectors / group_size; 3673 3674 number_of_places_left_in_vector = nunits; 3675 constant_p = true; 3676 tree_vector_builder elts (vector_type, nunits, 1); 3677 elts.quick_grow (nunits); 3678 bool place_after_defs = false; 3679 for (j = 0; j < number_of_copies; j++) 3680 { 3681 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--) 3682 { 3683 /* Create 'vect_ = {op0,op1,...,opn}'. */ 3684 number_of_places_left_in_vector--; 3685 tree orig_op = op; 3686 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) 3687 { 3688 if (CONSTANT_CLASS_P (op)) 3689 { 3690 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3691 { 3692 /* Can't use VIEW_CONVERT_EXPR for booleans because 3693 of possibly different sizes of scalar value and 3694 vector element. 
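     For example (assumed target, for illustration only): an AVX-512
     mask vector has single-bit elements while the scalar boolean is
     8 bits wide, so no bit-preserving conversion exists and we instead
     rebuild the constant as the zero or all-ones value of the vector
     element type below.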
*/ 3695 if (integer_zerop (op)) 3696 op = build_int_cst (TREE_TYPE (vector_type), 0); 3697 else if (integer_onep (op)) 3698 op = build_all_ones_cst (TREE_TYPE (vector_type)); 3699 else 3700 gcc_unreachable (); 3701 } 3702 else 3703 op = fold_unary (VIEW_CONVERT_EXPR, 3704 TREE_TYPE (vector_type), op); 3705 gcc_assert (op && CONSTANT_CLASS_P (op)); 3706 } 3707 else 3708 { 3709 tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); 3710 gimple *init_stmt; 3711 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3712 { 3713 tree true_val 3714 = build_all_ones_cst (TREE_TYPE (vector_type)); 3715 tree false_val 3716 = build_zero_cst (TREE_TYPE (vector_type)); 3717 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); 3718 init_stmt = gimple_build_assign (new_temp, COND_EXPR, 3719 op, true_val, 3720 false_val); 3721 } 3722 else 3723 { 3724 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), 3725 op); 3726 init_stmt 3727 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, 3728 op); 3729 } 3730 gimple_seq_add_stmt (&ctor_seq, init_stmt); 3731 op = new_temp; 3732 } 3733 } 3734 elts[number_of_places_left_in_vector] = op; 3735 if (!CONSTANT_CLASS_P (op)) 3736 constant_p = false; 3737 if (TREE_CODE (orig_op) == SSA_NAME 3738 && !SSA_NAME_IS_DEFAULT_DEF (orig_op) 3739 && STMT_VINFO_BB_VINFO (stmt_vinfo) 3740 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb 3741 == gimple_bb (SSA_NAME_DEF_STMT (orig_op)))) 3742 place_after_defs = true; 3743 3744 if (number_of_places_left_in_vector == 0) 3745 { 3746 if (constant_p 3747 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 3748 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 3749 vec_cst = gimple_build_vector (&ctor_seq, &elts); 3750 else 3751 { 3752 if (permute_results.is_empty ()) 3753 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, 3754 elts, number_of_vectors, 3755 permute_results); 3756 vec_cst = permute_results[number_of_vectors - j - 1]; 3757 } 3758 tree init; 3759 gimple_stmt_iterator gsi; 3760 if (place_after_defs) 3761 { 3762 stmt_vec_info last_stmt_info 3763 = vect_find_last_scalar_stmt_in_slp (slp_node); 3764 gsi = gsi_for_stmt (last_stmt_info->stmt); 3765 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type, 3766 &gsi); 3767 } 3768 else 3769 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type, 3770 NULL); 3771 if (ctor_seq != NULL) 3772 { 3773 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init)); 3774 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT); 3775 ctor_seq = NULL; 3776 } 3777 voprnds.quick_push (init); 3778 place_after_defs = false; 3779 number_of_places_left_in_vector = nunits; 3780 constant_p = true; 3781 elts.new_vector (vector_type, nunits, 1); 3782 elts.quick_grow (nunits); 3783 } 3784 } 3785 } 3786 3787 /* Since the vectors are created in the reverse order, we should invert 3788 them. */ 3789 vec_num = voprnds.length (); 3790 for (j = vec_num; j != 0; j--) 3791 { 3792 vop = voprnds[j - 1]; 3793 vec_oprnds->quick_push (vop); 3794 } 3795 3796 /* In case that VF is greater than the unrolling factor needed for the SLP 3797 group of stmts, NUMBER_OF_VECTORS to be created is greater than 3798 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have 3799 to replicate the vectors. 
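     E.g. (illustrative counts): if four vectors are required but the
     loop above produced only two, then either a splat of the neutral
     operand is pushed for each missing vector (when one is available)
     or the two existing vectors are pushed again in the same order.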
*/ 3800 while (number_of_vectors > vec_oprnds->length ()) 3801 { 3802 tree neutral_vec = NULL; 3803 3804 if (neutral_op) 3805 { 3806 if (!neutral_vec) 3807 neutral_vec = build_vector_from_val (vector_type, neutral_op); 3808 3809 vec_oprnds->quick_push (neutral_vec); 3810 } 3811 else 3812 { 3813 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) 3814 vec_oprnds->quick_push (vop); 3815 } 3816 } 3817 } 3818 3819 3820 /* Get vectorized definitions from SLP_NODE that contains corresponding 3821 vectorized def-stmts. */ 3822 3823 static void 3824 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds) 3825 { 3826 stmt_vec_info vec_def_stmt_info; 3827 unsigned int i; 3828 3829 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ()); 3830 3831 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info) 3832 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt)); 3833 } 3834 3835 3836 /* Get N vectorized definitions for SLP_NODE. 3837 If the scalar definitions are loop invariants or constants, collect them and 3838 call vect_get_constant_vectors() to create vector stmts. 3839 Otherwise, the def-stmts must be already vectorized and the vectorized stmts 3840 must be stored in the corresponding child of SLP_NODE, and we call 3841 vect_get_slp_vect_defs () to retrieve them. */ 3842 3843 void 3844 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n) 3845 { 3846 if (n == -1U) 3847 n = SLP_TREE_CHILDREN (slp_node).length (); 3848 3849 for (unsigned i = 0; i < n; ++i) 3850 { 3851 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; 3852 3853 vec<tree> vec_defs = vNULL; 3854 3855 /* For each operand we check if it has vectorized definitions in a child 3856 node or we need to create them (for invariants and constants). */ 3857 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) 3858 { 3859 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child)); 3860 vect_get_slp_vect_defs (child, &vec_defs); 3861 } 3862 else 3863 vect_get_constant_vectors (slp_node, i, &vec_defs); 3864 3865 vec_oprnds->quick_push (vec_defs); 3866 } 3867 } 3868 3869 /* Generate vector permute statements from a list of loads in DR_CHAIN. 3870 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid 3871 permute statements for the SLP node NODE of the SLP instance 3872 SLP_NODE_INSTANCE. */ 3873 3874 bool 3875 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, 3876 gimple_stmt_iterator *gsi, poly_uint64 vf, 3877 slp_instance slp_node_instance, bool analyze_only, 3878 unsigned *n_perms) 3879 { 3880 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 3881 vec_info *vinfo = stmt_info->vinfo; 3882 int vec_index = 0; 3883 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3884 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); 3885 unsigned int mask_element; 3886 machine_mode mode; 3887 3888 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 3889 return false; 3890 3891 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); 3892 3893 mode = TYPE_MODE (vectype); 3894 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 3895 3896 /* Initialize the vect stmts of NODE to properly insert the generated 3897 stmts later. */ 3898 if (! analyze_only) 3899 for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); 3900 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) 3901 SLP_TREE_VEC_STMTS (node).quick_push (NULL); 3902 3903 /* Generate permutation masks for every NODE. Number of masks for each NODE 3904 is equal to GROUP_SIZE. 
3905 E.g., we have a group of three nodes with three loads from the same 3906 location in each node, and the vector size is 4. I.e., we have a 3907 a0b0c0a1b1c1... sequence and we need to create the following vectors: 3908 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 3909 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 3910 ... 3911 3912 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. 3913 The last mask is illegal since we assume two operands for permute 3914 operation, and the mask element values can't be outside that range. 3915 Hence, the last mask must be converted into {2,5,5,5}. 3916 For the first two permutations we need the first and the second input 3917 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation 3918 we need the second and the third vectors: {b1,c1,a2,b2} and 3919 {c2,a3,b3,c3}. */ 3920 3921 int vect_stmts_counter = 0; 3922 unsigned int index = 0; 3923 int first_vec_index = -1; 3924 int second_vec_index = -1; 3925 bool noop_p = true; 3926 *n_perms = 0; 3927 3928 vec_perm_builder mask; 3929 unsigned int nelts_to_build; 3930 unsigned int nvectors_per_build; 3931 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) 3932 && multiple_p (nunits, group_size)); 3933 if (repeating_p) 3934 { 3935 /* A single vector contains a whole number of copies of the node, so: 3936 (a) all permutes can use the same mask; and 3937 (b) the permutes only need a single vector input. */ 3938 mask.new_vector (nunits, group_size, 3); 3939 nelts_to_build = mask.encoded_nelts (); 3940 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); 3941 } 3942 else 3943 { 3944 /* We need to construct a separate mask for each vector statement. */ 3945 unsigned HOST_WIDE_INT const_nunits, const_vf; 3946 if (!nunits.is_constant (&const_nunits) 3947 || !vf.is_constant (&const_vf)) 3948 return false; 3949 mask.new_vector (const_nunits, const_nunits, 1); 3950 nelts_to_build = const_vf * group_size; 3951 nvectors_per_build = 1; 3952 } 3953 3954 unsigned int count = mask.encoded_nelts (); 3955 mask.quick_grow (count); 3956 vec_perm_indices indices; 3957 3958 for (unsigned int j = 0; j < nelts_to_build; j++) 3959 { 3960 unsigned int iter_num = j / group_size; 3961 unsigned int stmt_num = j % group_size; 3962 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) 3963 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]); 3964 if (repeating_p) 3965 { 3966 first_vec_index = 0; 3967 mask_element = i; 3968 } 3969 else 3970 { 3971 /* Enforced before the loop when !repeating_p. */ 3972 unsigned int const_nunits = nunits.to_constant (); 3973 vec_index = i / const_nunits; 3974 mask_element = i % const_nunits; 3975 if (vec_index == first_vec_index 3976 || first_vec_index == -1) 3977 { 3978 first_vec_index = vec_index; 3979 } 3980 else if (vec_index == second_vec_index 3981 || second_vec_index == -1) 3982 { 3983 second_vec_index = vec_index; 3984 mask_element += const_nunits; 3985 } 3986 else 3987 { 3988 if (dump_enabled_p ()) 3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3990 "permutation requires at " 3991 "least three vectors %G", 3992 stmt_info->stmt); 3993 gcc_assert (analyze_only); 3994 return false; 3995 } 3996 3997 gcc_assert (mask_element < 2 * const_nunits); 3998 } 3999 4000 if (mask_element != index) 4001 noop_p = false; 4002 mask[index++] = mask_element; 4003 4004 if (index == count && !noop_p) 4005 { 4006 indices.new_vector (mask, second_vec_index == -1 ? 
1 : 2, nunits); 4007 if (!can_vec_perm_const_p (mode, indices)) 4008 { 4009 if (dump_enabled_p ()) 4010 { 4011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, 4012 vect_location, 4013 "unsupported vect permute { "); 4014 for (i = 0; i < count; ++i) 4015 { 4016 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); 4017 dump_printf (MSG_MISSED_OPTIMIZATION, " "); 4018 } 4019 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); 4020 } 4021 gcc_assert (analyze_only); 4022 return false; 4023 } 4024 4025 ++*n_perms; 4026 } 4027 4028 if (index == count) 4029 { 4030 if (!analyze_only) 4031 { 4032 tree mask_vec = NULL_TREE; 4033 4034 if (! noop_p) 4035 mask_vec = vect_gen_perm_mask_checked (vectype, indices); 4036 4037 if (second_vec_index == -1) 4038 second_vec_index = first_vec_index; 4039 4040 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) 4041 { 4042 /* Generate the permute statement if necessary. */ 4043 tree first_vec = dr_chain[first_vec_index + ri]; 4044 tree second_vec = dr_chain[second_vec_index + ri]; 4045 stmt_vec_info perm_stmt_info; 4046 if (! noop_p) 4047 { 4048 gassign *stmt = as_a <gassign *> (stmt_info->stmt); 4049 tree perm_dest 4050 = vect_create_destination_var (gimple_assign_lhs (stmt), 4051 vectype); 4052 perm_dest = make_ssa_name (perm_dest); 4053 gassign *perm_stmt 4054 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, 4055 first_vec, second_vec, 4056 mask_vec); 4057 perm_stmt_info 4058 = vect_finish_stmt_generation (stmt_info, perm_stmt, 4059 gsi); 4060 } 4061 else 4062 /* If mask was NULL_TREE generate the requested 4063 identity transform. */ 4064 perm_stmt_info = vinfo->lookup_def (first_vec); 4065 4066 /* Store the vector statement in NODE. */ 4067 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] 4068 = perm_stmt_info; 4069 } 4070 } 4071 4072 index = 0; 4073 first_vec_index = -1; 4074 second_vec_index = -1; 4075 noop_p = true; 4076 } 4077 } 4078 4079 return true; 4080 } 4081 4082 /* Vectorize SLP instance tree in postorder. */ 4083 4084 static void 4085 vect_schedule_slp_instance (slp_tree node, slp_instance instance) 4086 { 4087 gimple_stmt_iterator si; 4088 stmt_vec_info stmt_info; 4089 unsigned int group_size; 4090 tree vectype; 4091 int i, j; 4092 slp_tree child; 4093 4094 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 4095 return; 4096 4097 /* See if we have already vectorized the node in the graph of the 4098 SLP instance. */ 4099 if (SLP_TREE_VEC_STMTS (node).exists ()) 4100 return; 4101 4102 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4103 vect_schedule_slp_instance (child, instance); 4104 4105 /* Push SLP node def-type to stmts. */ 4106 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4107 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 4108 { 4109 stmt_vec_info child_stmt_info; 4110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info) 4111 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child); 4112 } 4113 4114 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 4115 4116 /* VECTYPE is the type of the destination. 
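     E.g., for a node of four int stores this is typically something
     like V4SI (the concrete type here is illustrative; the real one
     was computed during analysis and is taken from STMT_VINFO_VECTYPE
     just below).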
*/ 4117 vectype = STMT_VINFO_VECTYPE (stmt_info); 4118 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 4119 group_size = SLP_INSTANCE_GROUP_SIZE (instance); 4120 4121 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); 4122 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); 4123 4124 if (dump_enabled_p ()) 4125 dump_printf_loc (MSG_NOTE, vect_location, 4126 "------>vectorizing SLP node starting from: %G", 4127 stmt_info->stmt); 4128 4129 /* Vectorized stmts go before the last scalar stmt which is where 4130 all uses are ready. */ 4131 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); 4132 si = gsi_for_stmt (last_stmt_info->stmt); 4133 4134 /* Handle two-operation SLP nodes by vectorizing the group with 4135 both operations and then performing a merge. */ 4136 bool done_p = false; 4137 if (SLP_TREE_TWO_OPERATORS (node)) 4138 { 4139 gassign *stmt = as_a <gassign *> (stmt_info->stmt); 4140 enum tree_code code0 = gimple_assign_rhs_code (stmt); 4141 enum tree_code ocode = ERROR_MARK; 4142 stmt_vec_info ostmt_info; 4143 vec_perm_builder mask (group_size, group_size, 1); 4144 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info) 4145 { 4146 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt); 4147 if (gimple_assign_rhs_code (ostmt) != code0) 4148 { 4149 mask.quick_push (1); 4150 ocode = gimple_assign_rhs_code (ostmt); 4151 } 4152 else 4153 mask.quick_push (0); 4154 } 4155 if (ocode != ERROR_MARK) 4156 { 4157 vec<stmt_vec_info> v0; 4158 vec<stmt_vec_info> v1; 4159 unsigned j; 4160 tree tmask = NULL_TREE; 4161 vect_transform_stmt (stmt_info, &si, node, instance); 4162 v0 = SLP_TREE_VEC_STMTS (node).copy (); 4163 SLP_TREE_VEC_STMTS (node).truncate (0); 4164 gimple_assign_set_rhs_code (stmt, ocode); 4165 vect_transform_stmt (stmt_info, &si, node, instance); 4166 gimple_assign_set_rhs_code (stmt, code0); 4167 v1 = SLP_TREE_VEC_STMTS (node).copy (); 4168 SLP_TREE_VEC_STMTS (node).truncate (0); 4169 tree meltype = build_nonstandard_integer_type 4170 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1); 4171 tree mvectype = get_same_sized_vectype (meltype, vectype); 4172 unsigned k = 0, l; 4173 for (j = 0; j < v0.length (); ++j) 4174 { 4175 /* Enforced by vect_build_slp_tree, which rejects variable-length 4176 vectors for SLP_TREE_TWO_OPERATORS. */ 4177 unsigned int const_nunits = nunits.to_constant (); 4178 tree_vector_builder melts (mvectype, const_nunits, 1); 4179 for (l = 0; l < const_nunits; ++l) 4180 { 4181 if (k >= group_size) 4182 k = 0; 4183 tree t = build_int_cst (meltype, 4184 mask[k++] * const_nunits + l); 4185 melts.quick_push (t); 4186 } 4187 tmask = melts.build (); 4188 4189 /* ??? Not all targets support a VEC_PERM_EXPR with a 4190 constant mask that would translate to a vec_merge RTX 4191 (with their vec_perm_const_ok). We can either not 4192 vectorize in that case or let veclower do its job. 4193 Unfortunately that isn't too great and at least for 4194 plus/minus we'd eventually like to match targets 4195 vector addsub instructions. */ 4196 gimple *vstmt; 4197 vstmt = gimple_build_assign (make_ssa_name (vectype), 4198 VEC_PERM_EXPR, 4199 gimple_assign_lhs (v0[j]->stmt), 4200 gimple_assign_lhs (v1[j]->stmt), 4201 tmask); 4202 SLP_TREE_VEC_STMTS (node).quick_push 4203 (vect_finish_stmt_generation (stmt_info, vstmt, &si)); 4204 } 4205 v0.release (); 4206 v1.release (); 4207 done_p = true; 4208 } 4209 } 4210 if (!done_p) 4211 vect_transform_stmt (stmt_info, &si, node, instance); 4212 4213 /* Restore stmt def-types. 
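     This undoes the temporary override performed before the transform
     above, where children that are not vect_internal_def had their
     def-type pushed to their scalar stmts so that vect_transform_stmt
     would treat the corresponding operands as invariants.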
*/ 4214 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4215 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 4216 { 4217 stmt_vec_info child_stmt_info; 4218 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info) 4219 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def; 4220 } 4221 } 4222 4223 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero. 4224 For loop vectorization this is done in vectorizable_call, but for SLP 4225 it needs to be deferred until end of vect_schedule_slp, because multiple 4226 SLP instances may refer to the same scalar stmt. */ 4227 4228 static void 4229 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited) 4230 { 4231 gimple *new_stmt; 4232 gimple_stmt_iterator gsi; 4233 int i; 4234 slp_tree child; 4235 tree lhs; 4236 stmt_vec_info stmt_info; 4237 4238 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 4239 return; 4240 4241 if (visited.add (node)) 4242 return; 4243 4244 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4245 vect_remove_slp_scalar_calls (child, visited); 4246 4247 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) 4248 { 4249 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt); 4250 if (!stmt || gimple_bb (stmt) == NULL) 4251 continue; 4252 if (is_pattern_stmt_p (stmt_info) 4253 || !PURE_SLP_STMT (stmt_info)) 4254 continue; 4255 lhs = gimple_call_lhs (stmt); 4256 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); 4257 gsi = gsi_for_stmt (stmt); 4258 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt); 4259 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt; 4260 } 4261 } 4262 4263 static void 4264 vect_remove_slp_scalar_calls (slp_tree node) 4265 { 4266 hash_set<slp_tree> visited; 4267 vect_remove_slp_scalar_calls (node, visited); 4268 } 4269 4270 /* Vectorize the instance root. */ 4271 4272 void 4273 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) 4274 { 4275 gassign *rstmt = NULL; 4276 4277 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1) 4278 { 4279 stmt_vec_info child_stmt_info; 4280 int j; 4281 4282 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info) 4283 { 4284 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt); 4285 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt); 4286 if (!useless_type_conversion_p (TREE_TYPE (root_lhs), 4287 TREE_TYPE (vect_lhs))) 4288 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs), 4289 vect_lhs); 4290 rstmt = gimple_build_assign (root_lhs, vect_lhs); 4291 break; 4292 } 4293 } 4294 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1) 4295 { 4296 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); 4297 stmt_vec_info child_stmt_info; 4298 int j; 4299 vec<constructor_elt, va_gc> *v; 4300 vec_alloc (v, nelts); 4301 4302 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info) 4303 { 4304 CONSTRUCTOR_APPEND_ELT (v, 4305 NULL_TREE, 4306 gimple_get_lhs (child_stmt_info->stmt)); 4307 } 4308 tree lhs = gimple_get_lhs (instance->root_stmt->stmt); 4309 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt)); 4310 tree r_constructor = build_constructor (rtype, v); 4311 rstmt = gimple_build_assign (lhs, r_constructor); 4312 } 4313 4314 gcc_assert (rstmt); 4315 4316 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt); 4317 gsi_replace (&rgsi, rstmt, true); 4318 } 4319 4320 /* Generate vector code for all SLP instances in the loop/basic block. 
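   This runs two walks over the instances: the first schedules each
   instance tree (and, for instances rooted at a constructor, replaces
   the root stmt), the second performs the remaining fixups: setting
   the latch values of vectorized reduction PHIs and removing scalar
   calls and group stores that vectorization has made dead.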
*/ 4321 4322 void 4323 vect_schedule_slp (vec_info *vinfo) 4324 { 4325 vec<slp_instance> slp_instances; 4326 slp_instance instance; 4327 unsigned int i; 4328 4329 slp_instances = vinfo->slp_instances; 4330 FOR_EACH_VEC_ELT (slp_instances, i, instance) 4331 { 4332 slp_tree node = SLP_INSTANCE_TREE (instance); 4333 /* Schedule the tree of INSTANCE. */ 4334 vect_schedule_slp_instance (node, instance); 4335 4336 if (SLP_INSTANCE_ROOT_STMT (instance)) 4337 vectorize_slp_instance_root_stmt (node, instance); 4338 4339 if (dump_enabled_p ()) 4340 dump_printf_loc (MSG_NOTE, vect_location, 4341 "vectorizing stmts using SLP.\n"); 4342 } 4343 4344 FOR_EACH_VEC_ELT (slp_instances, i, instance) 4345 { 4346 slp_tree root = SLP_INSTANCE_TREE (instance); 4347 stmt_vec_info store_info; 4348 unsigned int j; 4349 4350 /* For reductions set the latch values of the vectorized PHIs. */ 4351 if (instance->reduc_phis 4352 && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS 4353 (instance->reduc_phis)[0]) != FOLD_LEFT_REDUCTION 4354 && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS 4355 (instance->reduc_phis)[0]) != EXTRACT_LAST_REDUCTION) 4356 { 4357 slp_tree slp_node = root; 4358 slp_tree phi_node = instance->reduc_phis; 4359 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); 4360 edge e = loop_latch_edge (gimple_bb (phi)->loop_father); 4361 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () 4362 == SLP_TREE_VEC_STMTS (slp_node).length ()); 4363 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j) 4364 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]->stmt), 4365 gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[j]->stmt), 4366 e, gimple_phi_arg_location (phi, e->dest_idx)); 4367 } 4368 4369 /* Remove scalar call stmts. Do not do this for basic-block 4370 vectorization as not all uses may be vectorized. 4371 ??? Why should this be necessary? DCE should be able to 4372 remove the stmts itself. 4373 ??? For BB vectorization we can as well remove scalar 4374 stmts starting from the SLP tree root if they have no 4375 uses. */ 4376 if (is_a <loop_vec_info> (vinfo)) 4377 vect_remove_slp_scalar_calls (root); 4378 4379 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info) 4380 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++) 4381 { 4382 if (!STMT_VINFO_DATA_REF (store_info)) 4383 break; 4384 4385 if (SLP_INSTANCE_ROOT_STMT (instance)) 4386 continue; 4387 4388 store_info = vect_orig_stmt (store_info); 4389 /* Free the attached stmt_vec_info and remove the stmt. */ 4390 vinfo->remove_stmt (store_info); 4391 } 4392 } 4393 } 4394
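
/* An end-to-end illustration of what the code above produces.  This is
   a sketch with assumed types and target support, not output copied
   from a real run.  Given a basic-block group of four int stores

       a[0] = b[0] + 1;
       a[1] = b[1] + 1;
       a[2] = b[2] + 1;
       a[3] = b[3] + 1;

   SLP analysis builds an instance whose tree has the four stores in the
   root node and the four additions in its child, and vect_schedule_slp
   turns it into something like

       vect_b = MEM <vector(4) int> [(int *)&b];
       vect_sum = vect_b + { 1, 1, 1, 1 };
       MEM <vector(4) int> [(int *)&a] = vect_sum;

   after which the now-dead scalar stores are removed by the loop at the
   end of vect_schedule_slp.  */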